Running causal_conv1d benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 467.230us 2421.38% 467.230us 467.230us 1
torch_eager 10.72% 231.062us 99.69% 2.148ms 2.148ms 0.000us 0.00% 21.632us 21.632us 1
aten::to 0.58% 12.480us 78.88% 1.700ms 283.277us 0.000us 0.00% 14.336us 2.389us 6
aten::_to_copy 2.05% 44.092us 78.31% 1.687ms 281.197us 0.000us 0.00% 14.336us 2.389us 6
aten::copy_ 3.07% 66.050us 73.46% 1.583ms 263.783us 12.000us 62.19% 14.336us 2.389us 6
aten::conv1d 0.49% 10.600us 7.90% 170.164us 56.721us 0.000us 0.00% 7.296us 2.432us 3
aten::convolution 0.77% 16.490us 7.41% 159.564us 53.188us 0.000us 0.00% 7.296us 2.432us 3
aten::_convolution 1.64% 35.301us 6.64% 143.074us 47.691us 0.000us 0.00% 7.296us 2.432us 3
aten::_conv_depthwise2d 1.69% 36.381us 4.00% 86.271us 28.757us 7.296us 37.81% 7.296us 2.432us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.296us 37.81% 7.296us 2.432us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.67% 6.304us 2.101us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.52% 5.696us 1.899us 3
Activity Buffer Request 66.85% 1.440ms 66.85% 1.440ms 1.440ms 2.336us 12.11% 2.336us 2.336us 1
aten::empty_strided 2.80% 60.390us 2.80% 60.390us 10.065us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 4.73% 101.823us 4.73% 101.823us 11.314us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.46% 31.451us 1.84% 39.731us 4.415us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.62% 13.289us 0.62% 13.289us 0.886us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.58% 12.560us 0.58% 12.560us 4.187us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.54% 11.740us 0.54% 11.740us 3.913us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.42% 8.963us 0.49% 10.602us 3.534us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.155ms
Self CUDA time total: 19.296us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.566us 1726.42% 337.566us 337.566us 1
torch_eager 6.86% 130.161us 99.69% 1.893ms 1.893ms 0.000us 0.00% 21.665us 21.665us 1
aten::to 0.32% 6.060us 85.13% 1.616ms 269.375us 0.000us 0.00% 13.729us 2.288us 6
aten::_to_copy 1.27% 24.100us 84.81% 1.610ms 268.365us 0.000us 0.00% 13.729us 2.288us 6
aten::copy_ 2.69% 51.011us 81.95% 1.556ms 259.305us 11.617us 59.41% 13.729us 2.288us 6
aten::conv1d 0.30% 5.740us 6.23% 118.253us 39.418us 0.000us 0.00% 7.936us 2.645us 3
aten::convolution 0.52% 9.902us 5.93% 112.513us 37.504us 0.000us 0.00% 7.936us 2.645us 3
aten::_convolution 1.21% 22.959us 5.40% 102.611us 34.204us 0.000us 0.00% 7.936us 2.645us 3
aten::_conv_depthwise2d 1.18% 22.461us 3.33% 63.161us 21.054us 7.936us 40.59% 7.936us 2.645us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.59% 7.936us 2.645us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.09% 6.080us 2.027us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.537us 28.32% 5.537us 1.846us 3
Activity Buffer Request 76.56% 1.454ms 76.56% 1.454ms 1.454ms 2.112us 10.80% 2.112us 2.112us 1
aten::empty_strided 1.59% 30.260us 1.59% 30.260us 5.043us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.84% 72.993us 3.84% 72.993us 8.110us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.96% 18.220us 1.27% 24.051us 2.672us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 9.451us 0.50% 9.451us 0.630us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.52% 9.960us 0.52% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.48% 9.030us 0.48% 9.030us 3.010us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 5.890us 0.39% 7.340us 2.447us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.899ms
Self CUDA time total: 19.553us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.726us 1795.19% 333.726us 333.726us 1
torch_eager 6.76% 126.472us 99.71% 1.865ms 1.865ms 0.000us 0.00% 20.510us 20.510us 1
aten::to 0.32% 5.970us 85.12% 1.592ms 265.378us 0.000us 0.00% 13.598us 2.266us 6
aten::_to_copy 1.26% 23.561us 84.80% 1.586ms 264.383us 0.000us 0.00% 13.598us 2.266us 6
aten::copy_ 2.75% 51.371us 81.92% 1.532ms 255.399us 11.678us 62.82% 13.598us 2.266us 6
aten::conv1d 0.31% 5.850us 6.37% 119.083us 39.694us 0.000us 0.00% 6.912us 2.304us 3
aten::convolution 0.54% 10.170us 6.05% 113.233us 37.744us 0.000us 0.00% 6.912us 2.304us 3
aten::_convolution 1.25% 23.320us 5.51% 103.063us 34.354us 0.000us 0.00% 6.912us 2.304us 3
aten::_conv_depthwise2d 1.20% 22.402us 3.41% 63.713us 21.238us 6.912us 37.18% 6.912us 2.304us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.912us 37.18% 6.912us 2.304us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.951us 32.01% 5.951us 1.984us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.727us 30.81% 5.727us 1.909us 3
Activity Buffer Request 76.63% 1.433ms 76.63% 1.433ms 1.433ms 1.920us 10.33% 1.920us 1.920us 1
aten::empty_strided 1.62% 30.340us 1.62% 30.340us 5.057us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.76% 70.302us 3.76% 70.302us 7.811us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.94% 17.590us 1.23% 22.950us 2.550us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.48% 8.970us 0.48% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.54% 10.051us 0.54% 10.051us 3.350us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 8.519us 0.46% 8.519us 2.840us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.32% 5.980us 0.39% 7.380us 2.460us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.871ms
Self CUDA time total: 18.590us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.229us 1732.17% 339.229us 339.229us 1
torch_eager 6.09% 126.194us 99.75% 2.066ms 2.066ms 0.000us 0.00% 21.729us 21.729us 1
aten::to 0.29% 6.100us 86.58% 1.793ms 298.900us 0.000us 0.00% 14.018us 2.336us 6
aten::_to_copy 1.16% 23.990us 86.28% 1.787ms 297.883us 0.000us 0.00% 14.018us 2.336us 6
aten::copy_ 2.58% 53.448us 83.67% 1.733ms 288.850us 11.873us 60.63% 14.018us 2.336us 6
aten::conv1d 0.32% 6.580us 5.73% 118.763us 39.588us 0.000us 0.00% 7.711us 2.570us 3
aten::convolution 0.48% 9.870us 5.42% 112.183us 37.394us 0.000us 0.00% 7.711us 2.570us 3
aten::_convolution 1.09% 22.580us 4.94% 102.313us 34.104us 0.000us 0.00% 7.711us 2.570us 3
aten::_conv_depthwise2d 1.08% 22.411us 3.09% 64.033us 21.344us 7.711us 39.37% 7.711us 2.570us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.711us 39.37% 7.711us 2.570us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.145us 31.38% 6.145us 2.048us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.25% 5.728us 1.909us 3
Activity Buffer Request 69.66% 1.443ms 69.66% 1.443ms 1.443ms 2.145us 10.95% 2.145us 2.145us 1
aten::empty_strided 1.46% 30.210us 1.46% 30.210us 5.035us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.49% 258.686us 12.49% 258.686us 28.743us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 18.050us 1.12% 23.200us 2.578us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.42% 8.720us 0.42% 8.720us 0.581us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.49% 10.140us 0.49% 10.140us 3.380us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 9.442us 0.46% 9.442us 3.147us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.830us 0.35% 7.220us 2.407us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.071ms
Self CUDA time total: 19.584us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 342.208us 1399.74% 342.208us 342.208us 1
torch_eager 6.21% 125.160us 99.74% 2.012ms 2.012ms 0.000us 0.00% 26.720us 26.720us 1
aten::to 0.29% 5.910us 86.35% 1.742ms 290.270us 0.000us 0.00% 15.168us 2.528us 6
aten::_to_copy 1.25% 25.122us 86.06% 1.736ms 289.285us 0.000us 0.00% 15.168us 2.528us 6
aten::copy_ 2.93% 59.190us 83.27% 1.679ms 279.905us 12.896us 52.75% 15.168us 2.528us 6
aten::conv1d 0.28% 5.620us 5.81% 117.132us 39.044us 0.000us 0.00% 11.552us 3.851us 3
aten::convolution 0.49% 9.910us 5.53% 111.512us 37.171us 0.000us 0.00% 11.552us 3.851us 3
aten::_convolution 1.15% 23.280us 5.04% 101.602us 33.867us 0.000us 0.00% 11.552us 3.851us 3
aten::_conv_depthwise2d 1.09% 21.990us 3.08% 62.201us 20.734us 11.552us 47.25% 11.552us 3.851us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.25% 11.552us 3.851us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 27.09% 6.624us 2.208us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 25.65% 6.272us 2.091us 3
Activity Buffer Request 71.09% 1.434ms 71.09% 1.434ms 1.434ms 2.272us 9.29% 2.272us 2.272us 1
aten::empty_strided 1.55% 31.162us 1.55% 31.162us 5.194us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.29% 207.543us 10.29% 207.543us 23.060us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.90% 18.220us 1.17% 23.681us 2.631us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 8.971us 0.44% 8.971us 0.598us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.49% 9.951us 0.49% 9.951us 3.317us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 9.230us 0.46% 9.230us 3.077us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.780us 0.35% 7.150us 2.383us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.017ms
Self CUDA time total: 24.448us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 360.702us 1391.60% 360.702us 360.702us 1
torch_eager 7.02% 142.940us 99.74% 2.030ms 2.030ms 0.000us 0.00% 28.128us 28.128us 1
aten::to 0.30% 6.030us 85.23% 1.734ms 289.050us 0.000us 0.00% 15.136us 2.523us 6
aten::_to_copy 1.18% 23.913us 84.93% 1.728ms 288.045us 0.000us 0.00% 15.136us 2.523us 6
aten::copy_ 2.60% 52.858us 82.24% 1.673ms 278.911us 12.928us 49.88% 15.136us 2.523us 6
aten::conv1d 0.29% 5.931us 6.05% 123.062us 41.021us 0.000us 0.00% 12.992us 4.331us 3
aten::convolution 0.49% 10.049us 5.76% 117.131us 39.044us 0.000us 0.00% 12.992us 4.331us 3
aten::_convolution 1.15% 23.381us 5.26% 107.082us 35.694us 0.000us 0.00% 12.992us 4.331us 3
aten::_conv_depthwise2d 1.11% 22.652us 3.33% 67.801us 22.600us 12.992us 50.12% 12.992us 4.331us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 50.12% 12.992us 4.331us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.43% 6.592us 2.197us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.44% 6.336us 2.112us 3
Activity Buffer Request 70.88% 1.442ms 70.88% 1.442ms 1.442ms 2.208us 8.52% 2.208us 2.208us 1
aten::empty_strided 1.52% 30.891us 1.52% 30.891us 5.148us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.00% 203.394us 10.00% 203.394us 22.599us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.92% 18.741us 1.20% 24.361us 2.707us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.46% 9.330us 0.46% 9.330us 0.622us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.51% 10.450us 0.51% 10.450us 3.483us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 9.490us 0.47% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.900us 0.36% 7.380us 2.460us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.035ms
Self CUDA time total: 25.920us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 369.628us 962.57% 369.628us 369.628us 1
torch_eager 7.12% 161.009us 99.76% 2.255ms 2.255ms 0.000us 0.00% 40.960us 40.960us 1
aten::conv1d 0.32% 7.222us 5.82% 131.613us 43.871us 0.000us 0.00% 22.528us 7.509us 3
aten::convolution 0.54% 12.229us 5.50% 124.391us 41.464us 0.000us 0.00% 22.528us 7.509us 3
aten::_convolution 1.15% 26.031us 4.96% 112.162us 37.387us 0.000us 0.00% 22.528us 7.509us 3
aten::_conv_depthwise2d 1.09% 24.630us 3.00% 67.820us 22.607us 22.528us 58.67% 22.528us 7.509us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.528us 58.67% 22.528us 7.509us 3
aten::to 0.34% 7.671us 85.42% 1.931ms 321.787us 0.000us 0.00% 18.432us 3.072us 6
aten::_to_copy 1.41% 31.890us 85.08% 1.923ms 320.509us 0.000us 0.00% 18.432us 3.072us 6
aten::copy_ 2.64% 59.711us 82.13% 1.856ms 309.384us 15.872us 41.33% 18.432us 3.072us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.544us 22.25% 8.544us 2.848us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.08% 7.328us 2.443us 3
Activity Buffer Request 64.20% 1.451ms 64.20% 1.451ms 1.451ms 2.560us 6.67% 2.560us 2.560us 1
aten::empty_strided 1.54% 34.861us 1.54% 34.861us 5.810us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 16.32% 368.786us 16.32% 368.786us 40.976us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.93% 20.991us 1.15% 26.100us 2.900us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.41% 9.319us 0.41% 9.319us 0.621us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.44% 9.850us 0.44% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 9.970us 0.44% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 7.041us 0.38% 8.701us 2.900us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.260ms
Self CUDA time total: 38.400us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.007us 838.09% 343.007us 343.007us 1
torch_eager 6.47% 141.163us 99.73% 2.175ms 2.175ms 0.000us 0.00% 43.487us 43.487us 1
aten::conv1d 0.27% 5.870us 5.52% 120.313us 40.104us 0.000us 0.00% 25.376us 8.459us 3
aten::convolution 0.46% 10.120us 5.25% 114.443us 38.148us 0.000us 0.00% 25.376us 8.459us 3
aten::_convolution 1.12% 24.490us 4.78% 104.323us 34.774us 0.000us 0.00% 25.376us 8.459us 3
aten::_conv_depthwise2d 1.00% 21.702us 2.89% 62.963us 20.988us 25.376us 62.00% 25.376us 8.459us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.376us 62.00% 25.376us 8.459us 3
aten::to 0.28% 6.129us 86.46% 1.885ms 314.232us 0.000us 0.00% 18.111us 3.018us 6
aten::_to_copy 1.13% 24.640us 86.18% 1.879ms 313.211us 0.000us 0.00% 18.111us 3.018us 6
aten::copy_ 2.51% 54.672us 83.58% 1.823ms 303.754us 15.551us 38.00% 18.111us 3.018us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.224us 20.09% 8.224us 2.741us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 17.90% 7.327us 2.442us 3
Activity Buffer Request 66.59% 1.452ms 66.59% 1.452ms 1.452ms 2.560us 6.26% 2.560us 2.560us 1
aten::empty_strided 1.47% 32.100us 1.47% 32.100us 5.350us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 15.50% 338.007us 15.50% 338.007us 37.556us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.84% 18.320us 1.10% 24.070us 2.674us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.43% 9.420us 0.43% 9.420us 0.628us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.46% 10.080us 0.46% 10.080us 3.360us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.42% 9.080us 0.42% 9.080us 3.027us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 5.960us 0.34% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.181ms
Self CUDA time total: 40.927us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 367.004us 357.73% 367.004us 367.004us 1
torch_eager 6.17% 126.763us 99.73% 2.049ms 2.049ms 0.000us 0.00% 108.512us 108.512us 1
aten::conv1d 0.28% 5.761us 5.81% 119.372us 39.791us 0.000us 0.00% 70.432us 23.477us 3
aten::convolution 0.48% 9.820us 5.53% 113.611us 37.870us 0.000us 0.00% 70.432us 23.477us 3
aten::_convolution 1.11% 22.788us 5.05% 103.791us 34.597us 0.000us 0.00% 70.432us 23.477us 3
aten::_conv_depthwise2d 1.12% 22.910us 3.14% 64.601us 21.534us 70.432us 68.65% 70.432us 23.477us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.432us 68.65% 70.432us 23.477us 3
aten::to 0.30% 6.130us 86.37% 1.774ms 295.680us 0.000us 0.00% 38.080us 6.347us 6
aten::_to_copy 2.18% 44.819us 86.07% 1.768ms 294.658us 0.000us 0.00% 38.080us 6.347us 6
aten::copy_ 2.56% 52.622us 82.32% 1.691ms 281.815us 32.160us 31.35% 38.080us 6.347us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.504us 17.06% 17.504us 5.835us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.656us 14.29% 14.656us 4.885us 3
Activity Buffer Request 69.77% 1.433ms 69.77% 1.433ms 1.433ms 5.920us 5.77% 5.920us 5.920us 1
aten::empty_strided 1.57% 32.241us 1.57% 32.241us 5.373us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.08% 227.645us 11.08% 227.645us 25.294us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 17.849us 1.12% 23.070us 2.563us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.030us 0.44% 9.030us 0.602us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.49% 10.050us 0.49% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 9.040us 0.44% 9.040us 3.013us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.163us 0.38% 7.782us 2.594us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.054ms
Self CUDA time total: 102.592us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.959us 299.49% 336.959us 336.959us 1
torch_eager 6.25% 125.522us 99.75% 2.004ms 2.004ms 0.000us 0.00% 118.493us 118.493us 1
aten::conv1d 0.38% 7.700us 5.98% 120.223us 40.074us 0.000us 0.00% 80.479us 26.826us 3
aten::convolution 0.49% 9.780us 5.60% 112.523us 37.508us 0.000us 0.00% 80.479us 26.826us 3
aten::_convolution 1.13% 22.669us 5.11% 102.743us 34.248us 0.000us 0.00% 80.479us 26.826us 3
aten::_conv_depthwise2d 1.12% 22.452us 3.19% 64.073us 21.358us 80.479us 71.53% 80.479us 26.826us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.479us 71.53% 80.479us 26.826us 3
aten::to 0.29% 5.910us 86.14% 1.731ms 288.442us 0.000us 0.00% 38.014us 6.336us 6
aten::_to_copy 1.19% 24.001us 85.85% 1.725ms 287.457us 0.000us 0.00% 38.014us 6.336us 6
aten::copy_ 2.56% 51.481us 83.17% 1.671ms 278.473us 32.031us 28.47% 38.014us 6.336us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.567us 15.61% 17.567us 5.856us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 12.86% 14.464us 4.821us 3
Activity Buffer Request 71.72% 1.441ms 71.72% 1.441ms 1.441ms 5.983us 5.32% 5.983us 5.983us 1
aten::empty_strided 1.49% 29.901us 1.49% 29.901us 4.983us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.00% 200.814us 10.00% 200.814us 22.313us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.89% 17.861us 1.15% 23.111us 2.568us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 8.970us 0.45% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.50% 10.050us 0.50% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 9.169us 0.46% 9.169us 3.056us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.030us 0.38% 7.560us 2.520us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.009ms
Self CUDA time total: 112.510us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 5.98% 122.945us 97.76% 2.011ms 2.011ms 0.000us 0.00% 433.437us 433.437us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 423.709us 107.83% 423.709us 423.709us 1
aten::conv1d 0.28% 5.760us 5.73% 117.851us 39.284us 0.000us 0.00% 250.941us 83.647us 3
aten::convolution 0.48% 9.830us 5.45% 112.091us 37.364us 0.000us 0.00% 250.941us 83.647us 3
aten::_convolution 1.12% 23.111us 4.97% 102.261us 34.087us 0.000us 0.00% 250.941us 83.647us 3
aten::_conv_depthwise2d 1.03% 21.200us 3.03% 62.360us 20.787us 250.941us 63.86% 250.941us 83.647us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 250.941us 63.86% 250.941us 83.647us 3
aten::to 0.28% 5.851us 84.70% 1.742ms 290.313us 0.000us 0.00% 182.496us 30.416us 6
aten::_to_copy 1.16% 23.919us 84.41% 1.736ms 289.338us 0.000us 0.00% 182.496us 30.416us 6
aten::copy_ 2.53% 51.981us 81.78% 1.682ms 280.333us 142.016us 36.14% 182.496us 30.416us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 101.952us 25.94% 101.952us 33.984us 3
Activity Buffer Request 70.64% 1.453ms 70.64% 1.453ms 1.453ms 40.480us 10.30% 40.480us 40.480us 1
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.064us 10.20% 40.064us 13.355us 3
aten::empty_strided 1.46% 30.112us 1.46% 30.112us 5.019us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.67% 198.853us 9.67% 198.853us 22.095us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.91% 18.669us 1.18% 24.270us 2.697us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.151us 0.44% 9.151us 0.610us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.870us 0.48% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 9.710us 0.47% 9.710us 3.237us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.960us 0.36% 7.350us 2.450us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.057ms
Self CUDA time total: 392.957us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 5.86% 122.119us 95.18% 1.984ms 1.984ms 0.000us 0.00% 485.373us 485.373us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 475.549us 106.61% 475.549us 475.549us 1
aten::conv1d 0.29% 6.020us 5.58% 116.291us 38.764us 0.000us 0.00% 298.429us 99.476us 3
aten::convolution 0.46% 9.580us 5.29% 110.271us 36.757us 0.000us 0.00% 298.429us 99.476us 3
aten::_convolution 1.07% 22.391us 4.83% 100.691us 33.564us 0.000us 0.00% 298.429us 99.476us 3
aten::_conv_depthwise2d 1.02% 21.160us 3.01% 62.730us 20.910us 298.429us 66.91% 298.429us 99.476us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.429us 66.91% 298.429us 99.476us 3
aten::to 0.28% 5.929us 82.40% 1.718ms 286.300us 0.000us 0.00% 186.944us 31.157us 6
aten::_to_copy 1.13% 23.472us 82.12% 1.712ms 285.312us 0.000us 0.00% 186.944us 31.157us 6
aten::copy_ 2.45% 51.061us 79.57% 1.659ms 276.443us 147.616us 33.09% 186.944us 31.157us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 107.712us 24.15% 107.712us 35.904us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.904us 8.95% 39.904us 13.301us 3
Activity Buffer Request 68.65% 1.431ms 68.65% 1.431ms 1.431ms 39.328us 8.82% 39.328us 39.328us 1
aten::empty_strided 1.43% 29.742us 1.43% 29.742us 4.957us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.54% 198.903us 9.54% 198.903us 22.100us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.85% 17.731us 1.11% 23.210us 2.579us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.210us 0.44% 9.210us 0.614us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.850us 0.47% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.45% 9.320us 0.45% 9.320us 3.107us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.850us 0.35% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.085ms
Self CUDA time total: 446.045us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.833us 1729.88% 323.833us 323.833us 1
torch_eager 14.51% 116.191us 99.37% 795.884us 795.884us 0.000us 0.00% 20.608us 20.608us 1
aten::to 0.75% 6.009us 67.15% 537.870us 89.645us 0.000us 0.00% 13.376us 2.229us 6
aten::_to_copy 2.93% 23.471us 66.40% 531.861us 88.644us 0.000us 0.00% 13.376us 2.229us 6
aten::copy_ 6.32% 50.599us 59.65% 477.769us 79.628us 11.488us 61.37% 13.376us 2.229us 6
aten::conv1d 0.81% 6.510us 14.38% 115.173us 38.391us 0.000us 0.00% 7.232us 2.411us 3
aten::convolution 1.28% 10.221us 13.57% 108.663us 36.221us 0.000us 0.00% 7.232us 2.411us 3
aten::_convolution 2.73% 21.890us 12.29% 98.442us 32.814us 0.000us 0.00% 7.232us 2.411us 3
aten::_conv_depthwise2d 2.76% 22.080us 7.70% 61.700us 20.567us 7.232us 38.63% 7.232us 2.411us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 38.63% 7.232us 2.411us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 31.45% 5.888us 1.963us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 29.91% 5.600us 1.867us 3
Activity Buffer Request 31.20% 249.924us 31.20% 249.924us 249.924us 1.888us 10.09% 1.888us 1.888us 1
aten::empty_strided 3.82% 30.621us 3.82% 30.621us 5.103us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.75% 198.236us 24.75% 198.236us 22.026us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.09% 16.762us 2.71% 21.692us 2.410us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.04% 8.330us 1.04% 8.330us 0.555us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.15% 9.220us 1.15% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.17% 9.410us 1.17% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.72% 5.800us 0.89% 7.160us 2.387us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 800.944us
Self CUDA time total: 18.720us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.666us 1676.91% 324.666us 324.666us 1
torch_eager 15.17% 119.302us 99.37% 781.483us 781.483us 0.000us 0.00% 21.249us 21.249us 1
aten::to 0.72% 5.648us 65.85% 517.928us 86.321us 0.000us 0.00% 13.345us 2.224us 6
aten::_to_copy 2.87% 22.611us 65.14% 512.280us 85.380us 0.000us 0.00% 13.345us 2.224us 6
aten::copy_ 6.22% 48.900us 58.49% 460.037us 76.673us 11.457us 59.18% 13.345us 2.224us 6
aten::conv1d 0.87% 6.869us 14.99% 117.911us 39.304us 0.000us 0.00% 7.904us 2.635us 3
aten::convolution 1.27% 10.002us 14.12% 111.042us 37.014us 0.000us 0.00% 7.904us 2.635us 3
aten::_convolution 2.89% 22.710us 12.85% 101.040us 33.680us 0.000us 0.00% 7.904us 2.635us 3
aten::_conv_depthwise2d 2.75% 21.590us 8.00% 62.920us 20.973us 7.904us 40.82% 7.904us 2.635us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.82% 7.904us 2.635us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.825us 30.09% 5.825us 1.942us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.09% 5.632us 1.877us 3
Activity Buffer Request 30.25% 237.875us 30.25% 237.875us 237.875us 1.888us 9.75% 1.888us 1.888us 1
aten::empty_strided 3.77% 29.632us 3.77% 29.632us 4.939us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.87% 195.612us 24.87% 195.612us 21.735us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.14% 16.821us 2.78% 21.881us 2.431us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.08% 8.481us 1.08% 8.481us 0.565us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.22% 9.600us 1.22% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.19% 9.380us 1.19% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.75% 5.869us 0.93% 7.280us 2.427us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 786.473us
Self CUDA time total: 19.361us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.865us 1704.41% 328.865us 328.865us 1
torch_eager 14.92% 117.622us 99.37% 783.184us 783.184us 0.000us 0.00% 21.439us 21.439us 1
aten::to 0.74% 5.810us 66.49% 524.079us 87.347us 0.000us 0.00% 14.207us 2.368us 6
aten::_to_copy 3.01% 23.701us 65.75% 518.269us 86.378us 0.000us 0.00% 14.207us 2.368us 6
aten::copy_ 6.49% 51.190us 58.71% 462.718us 77.120us 12.063us 62.52% 14.207us 2.368us 6
aten::conv1d 0.75% 5.890us 14.60% 115.093us 38.364us 0.000us 0.00% 7.232us 2.411us 3
aten::convolution 1.22% 9.630us 13.86% 109.203us 36.401us 0.000us 0.00% 7.232us 2.411us 3
aten::_convolution 2.83% 22.270us 12.63% 99.573us 33.191us 0.000us 0.00% 7.232us 2.411us 3
aten::_conv_depthwise2d 2.80% 22.070us 7.82% 61.673us 20.558us 7.232us 37.48% 7.232us 2.411us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 37.48% 7.232us 2.411us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 32.34% 6.240us 2.080us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.823us 30.18% 5.823us 1.941us 3
Activity Buffer Request 29.70% 234.095us 29.70% 234.095us 234.095us 2.144us 11.11% 2.144us 2.144us 1
aten::empty_strided 4.04% 31.850us 4.04% 31.850us 5.308us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.25% 199.015us 25.25% 199.015us 22.113us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.15% 16.950us 2.78% 21.920us 2.436us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.05% 8.280us 1.05% 8.280us 0.552us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.22% 9.600us 1.22% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.07% 8.421us 1.07% 8.421us 2.807us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.76% 5.960us 0.92% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 788.184us
Self CUDA time total: 19.295us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.745us 1665.90% 334.745us 334.745us 1
torch_eager 14.26% 118.712us 99.40% 827.395us 827.395us 0.000us 0.00% 22.270us 22.270us 1
aten::to 0.70% 5.840us 67.41% 561.119us 93.520us 0.000us 0.00% 14.335us 2.389us 6
aten::_to_copy 2.86% 23.780us 66.71% 555.279us 92.546us 0.000us 0.00% 14.335us 2.389us 6
aten::copy_ 6.22% 51.741us 60.26% 501.588us 83.598us 12.159us 60.51% 14.335us 2.389us 6
aten::conv1d 0.81% 6.751us 14.52% 120.873us 40.291us 0.000us 0.00% 7.935us 2.645us 3
aten::convolution 1.20% 9.989us 13.71% 114.122us 38.041us 0.000us 0.00% 7.935us 2.645us 3
aten::_convolution 2.78% 23.181us 12.51% 104.133us 34.711us 0.000us 0.00% 7.935us 2.645us 3
aten::_conv_depthwise2d 2.64% 22.000us 7.72% 64.243us 21.414us 7.935us 39.49% 7.935us 2.645us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.49% 7.935us 2.645us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.239us 31.05% 6.239us 2.080us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.46% 5.920us 1.973us 3
Activity Buffer Request 32.59% 271.245us 32.59% 271.245us 271.245us 2.176us 10.83% 2.176us 2.176us 1
aten::empty_strided 3.59% 29.911us 3.59% 29.911us 4.985us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.22% 201.614us 24.22% 201.614us 22.402us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.06% 17.131us 2.68% 22.291us 2.477us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.07% 8.900us 1.07% 8.900us 0.593us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.16% 9.640us 1.16% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.15% 9.591us 1.15% 9.591us 3.197us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.79% 6.549us 0.97% 8.109us 2.703us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 832.395us
Self CUDA time total: 20.094us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.142us 918.64% 330.142us 330.142us 1
torch_eager 14.68% 120.212us 99.34% 813.674us 813.674us 0.000us 0.00% 38.530us 38.530us 1
aten::conv1d 0.79% 6.500us 14.15% 115.923us 38.641us 0.000us 0.00% 20.161us 6.720us 3
aten::convolution 1.18% 9.650us 13.36% 109.423us 36.474us 0.000us 0.00% 20.161us 6.720us 3
aten::_convolution 2.75% 22.509us 12.18% 99.773us 33.258us 0.000us 0.00% 20.161us 6.720us 3
aten::_conv_depthwise2d 2.55% 20.922us 7.56% 61.883us 20.628us 20.161us 56.10% 20.161us 6.720us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.161us 56.10% 20.161us 6.720us 3
aten::to 0.72% 5.880us 67.15% 549.969us 91.661us 0.000us 0.00% 18.369us 3.061us 6
aten::_to_copy 2.82% 23.099us 66.43% 544.089us 90.682us 0.000us 0.00% 18.369us 3.061us 6
aten::copy_ 6.44% 52.723us 59.97% 491.160us 81.860us 15.777us 43.90% 18.369us 3.061us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.51% 8.448us 2.816us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.329us 20.39% 7.329us 2.443us 3
Activity Buffer Request 32.20% 263.764us 32.20% 263.764us 263.764us 2.592us 7.21% 2.592us 2.592us 1
aten::empty_strided 3.64% 29.830us 3.64% 29.830us 4.972us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.00% 196.543us 24.00% 196.543us 21.838us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.14% 17.540us 2.77% 22.711us 2.523us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.07% 8.761us 1.07% 8.761us 0.584us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.21% 9.871us 1.21% 9.871us 3.290us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.13% 9.220us 1.13% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.68% 5.610us 0.85% 7.000us 2.333us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 819.054us
Self CUDA time total: 35.938us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.946us 872.79% 330.946us 330.946us 1
torch_eager 6.07% 120.841us 99.75% 1.987ms 1.987ms 0.000us 0.00% 40.478us 40.478us 1
aten::conv1d 0.33% 6.510us 5.92% 117.833us 39.278us 0.000us 0.00% 22.271us 7.424us 3
aten::convolution 0.49% 9.850us 5.59% 111.323us 37.108us 0.000us 0.00% 22.271us 7.424us 3
aten::_convolution 1.11% 22.181us 5.10% 101.473us 33.824us 0.000us 0.00% 22.271us 7.424us 3
aten::_conv_depthwise2d 1.10% 21.811us 3.17% 63.042us 21.014us 22.271us 58.73% 22.271us 7.424us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.271us 58.73% 22.271us 7.424us 3
aten::to 0.30% 5.981us 86.38% 1.720ms 286.727us 0.000us 0.00% 18.207us 3.034us 6
aten::_to_copy 1.18% 23.522us 86.08% 1.714ms 285.730us 0.000us 0.00% 18.207us 3.034us 6
aten::copy_ 2.55% 50.829us 83.41% 1.661ms 276.860us 15.647us 41.27% 18.207us 3.034us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 21.94% 8.320us 2.773us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 19.32% 7.327us 2.442us 3
Activity Buffer Request 72.02% 1.434ms 72.02% 1.434ms 1.434ms 2.560us 6.75% 2.560us 2.560us 1
aten::empty_strided 1.49% 29.700us 1.49% 29.700us 4.950us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.93% 197.835us 9.93% 197.835us 21.982us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.90% 17.980us 1.17% 23.390us 2.599us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 8.840us 0.44% 8.840us 0.589us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.50% 9.970us 0.50% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 9.410us 0.47% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 6.110us 0.38% 7.490us 2.497us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.992ms
Self CUDA time total: 37.918us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 385.308us 602.34% 385.308us 385.308us 1
torch_eager 14.42% 123.450us 99.41% 851.045us 851.045us 0.000us 0.00% 68.065us 68.065us 1
aten::conv1d 0.67% 5.711us 13.49% 115.513us 38.504us 0.000us 0.00% 41.633us 13.878us 3
aten::convolution 1.22% 10.470us 12.83% 109.802us 36.601us 0.000us 0.00% 41.633us 13.878us 3
aten::_convolution 2.63% 22.491us 11.60% 99.332us 33.111us 0.000us 0.00% 41.633us 13.878us 3
aten::_conv_depthwise2d 2.49% 21.351us 7.22% 61.852us 20.617us 41.633us 65.08% 41.633us 13.878us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.633us 65.08% 41.633us 13.878us 3
aten::to 0.71% 6.120us 68.08% 582.862us 97.144us 0.000us 0.00% 26.432us 4.405us 6
aten::_to_copy 2.87% 24.611us 67.37% 576.742us 96.124us 0.000us 0.00% 26.432us 4.405us 6
aten::copy_ 6.21% 53.173us 60.75% 520.070us 86.678us 22.336us 34.92% 26.432us 4.405us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.936us 18.66% 11.936us 3.979us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 16.26% 10.400us 3.467us 3
Activity Buffer Request 28.33% 242.554us 28.33% 242.554us 242.554us 4.096us 6.40% 4.096us 4.096us 1
aten::empty_strided 3.74% 32.061us 3.74% 32.061us 5.344us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 28.79% 246.523us 28.79% 246.523us 27.391us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.02% 17.269us 2.63% 22.529us 2.503us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.08% 9.240us 1.08% 9.240us 0.616us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.11% 9.521us 1.11% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.03% 8.800us 1.03% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.68% 5.830us 0.84% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 856.136us
Self CUDA time total: 63.969us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.859us 513.70% 357.859us 357.859us 1
torch_eager 20.53% 180.503us 99.40% 873.955us 873.955us 0.000us 0.00% 73.695us 73.695us 1
aten::conv1d 0.63% 5.530us 15.78% 138.703us 46.234us 0.000us 0.00% 47.359us 15.786us 3
aten::convolution 1.12% 9.840us 15.15% 133.173us 44.391us 0.000us 0.00% 47.359us 15.786us 3
aten::_convolution 2.65% 23.331us 14.03% 123.333us 41.111us 0.000us 0.00% 47.359us 15.786us 3
aten::_conv_depthwise2d 2.63% 23.161us 9.53% 83.782us 27.927us 47.359us 67.98% 47.359us 15.786us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.359us 67.98% 47.359us 15.786us 3
aten::to 0.72% 6.308us 59.85% 526.239us 87.707us 0.000us 0.00% 26.336us 4.389us 6
aten::_to_copy 2.80% 24.578us 59.14% 519.931us 86.655us 0.000us 0.00% 26.336us 4.389us 6
aten::copy_ 6.12% 53.792us 52.84% 464.590us 77.432us 22.304us 32.02% 26.336us 4.389us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.840us 17.00% 11.840us 3.947us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 15.02% 10.464us 3.488us 3
Activity Buffer Request 26.53% 233.244us 26.53% 233.244us 233.244us 4.032us 5.79% 4.032us 4.032us 1
aten::empty_strided 3.50% 30.763us 3.50% 30.763us 5.127us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 22.92% 201.494us 22.92% 201.494us 22.388us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.03% 17.891us 2.67% 23.440us 2.604us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.06% 9.339us 1.06% 9.339us 0.623us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 2.95% 25.971us 2.95% 25.971us 8.657us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.22% 10.710us 1.22% 10.710us 3.570us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 6.240us 0.88% 7.780us 2.593us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 879.215us
Self CUDA time total: 69.663us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 365.250us 197.10% 365.250us 365.250us 1
torch_eager 14.70% 119.032us 99.37% 804.604us 804.604us 0.000us 0.00% 195.299us 195.299us 1
aten::conv1d 0.95% 7.700us 17.22% 139.393us 46.464us 0.000us 0.00% 133.056us 44.352us 3
aten::convolution 1.24% 10.040us 16.26% 131.693us 43.898us 0.000us 0.00% 133.056us 44.352us 3
aten::_convolution 2.91% 23.550us 15.02% 121.653us 40.551us 0.000us 0.00% 133.056us 44.352us 3
aten::_conv_depthwise2d 2.69% 21.763us 10.08% 81.613us 27.204us 133.056us 71.80% 133.056us 44.352us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.056us 71.80% 133.056us 44.352us 3
aten::to 0.75% 6.042us 64.10% 518.999us 86.500us 0.000us 0.00% 62.243us 10.374us 6
aten::_to_copy 2.90% 23.470us 63.35% 512.957us 85.493us 0.000us 0.00% 62.243us 10.374us 6
aten::copy_ 6.35% 51.412us 56.59% 458.237us 76.373us 52.258us 28.20% 62.243us 10.374us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.250us 15.78% 29.250us 9.750us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.008us 12.42% 23.008us 7.669us 3
Activity Buffer Request 28.43% 230.213us 28.43% 230.213us 230.213us 9.985us 5.39% 9.985us 9.985us 1
aten::empty_strided 3.86% 31.250us 3.86% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.81% 217.052us 26.81% 217.052us 24.117us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.10% 17.030us 2.74% 22.170us 2.463us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.13% 9.170us 1.13% 9.170us 0.611us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.22% 9.870us 1.22% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.18% 9.540us 1.18% 9.540us 3.180us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.78% 6.320us 1.00% 8.100us 2.700us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 809.694us
Self CUDA time total: 185.314us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 352.824us 168.80% 352.824us 352.824us 1
torch_eager 14.40% 121.160us 99.40% 836.424us 836.424us 0.000us 0.00% 222.266us 222.266us 1
aten::conv1d 0.71% 5.981us 14.17% 119.243us 39.748us 0.000us 0.00% 153.724us 51.241us 3
aten::convolution 1.17% 9.810us 13.46% 113.262us 37.754us 0.000us 0.00% 153.724us 51.241us 3
aten::_convolution 2.76% 23.250us 12.29% 103.452us 34.484us 0.000us 0.00% 153.724us 51.241us 3
aten::_conv_depthwise2d 2.65% 22.340us 7.64% 64.321us 21.440us 153.724us 73.55% 153.724us 51.241us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.724us 73.55% 153.724us 51.241us 3
aten::to 0.70% 5.880us 67.58% 568.691us 94.782us 0.000us 0.00% 68.542us 11.424us 6
aten::_to_copy 2.81% 23.631us 66.88% 562.811us 93.802us 0.000us 0.00% 68.542us 11.424us 6
aten::copy_ 7.48% 62.921us 60.21% 506.640us 84.440us 55.294us 26.45% 68.542us 11.424us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.352us 15.48% 32.352us 10.784us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.942us 10.98% 22.942us 7.647us 3
Activity Buffer Request 31.88% 268.245us 31.88% 268.245us 268.245us 13.248us 6.34% 13.248us 13.248us 1
aten::empty_strided 3.87% 32.540us 3.87% 32.540us 5.423us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 23.51% 197.824us 23.51% 197.824us 21.980us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.07% 17.378us 2.68% 22.521us 2.502us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.06% 8.883us 1.06% 8.883us 0.592us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.19% 9.991us 1.19% 9.991us 3.330us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.15% 9.640us 1.15% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 5.990us 0.89% 7.470us 2.490us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 841.495us
Self CUDA time total: 209.018us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.78% 125.712us 53.74% 996.387us 996.387us 0.000us 0.00% 1.527ms 1.527ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.423ms 100.39% 1.423ms 1.423ms 1
aten::to 0.35% 6.438us 38.84% 720.182us 120.030us 0.000us 0.00% 832.992us 138.832us 6
aten::_to_copy 1.55% 28.691us 38.49% 713.744us 118.957us 0.000us 0.00% 832.992us 138.832us 6
aten::copy_ 2.90% 53.742us 26.33% 488.279us 81.380us 724.000us 51.06% 832.992us 138.832us 6
aten::conv1d 0.38% 6.960us 6.55% 121.533us 40.511us 0.000us 0.00% 693.950us 231.317us 3
aten::convolution 0.56% 10.430us 6.18% 114.573us 38.191us 0.000us 0.00% 693.950us 231.317us 3
aten::_convolution 1.25% 23.268us 5.62% 104.143us 34.714us 0.000us 0.00% 693.950us 231.317us 3
aten::_conv_depthwise2d 1.23% 22.830us 3.48% 64.552us 21.517us 693.950us 48.94% 693.950us 231.317us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 693.950us 48.94% 693.950us 231.317us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 410.655us 28.96% 410.655us 136.885us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 313.345us 22.10% 313.345us 104.448us 3
Activity Buffer Request 13.73% 254.654us 13.73% 254.654us 254.654us 108.992us 7.69% 108.992us 108.992us 1
aten::empty_strided 2.01% 37.271us 10.61% 196.774us 32.796us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.89% 201.884us 10.89% 201.884us 22.432us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.98% 18.223us 1.29% 23.933us 2.659us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.51% 9.490us 0.51% 9.490us 0.633us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.54% 10.101us 0.54% 10.101us 3.367us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.52% 9.620us 0.52% 9.620us 3.207us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.34% 6.270us 0.41% 7.680us 2.560us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.854ms
Self CUDA time total: 1.418ms
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 4.03% 122.972us 65.43% 1.999ms 1.999ms 0.000us 0.00% 1.502ms 1.502ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.41% 1.433ms 1.433ms 1
aten::to 0.19% 5.740us 56.63% 1.730ms 288.331us 0.000us 0.00% 766.432us 127.739us 6
aten::_to_copy 0.79% 24.119us 56.45% 1.724ms 287.375us 0.000us 0.00% 766.432us 127.739us 6
aten::copy_ 1.70% 52.020us 54.70% 1.671ms 278.493us 691.168us 48.43% 766.432us 127.739us 6
aten::conv1d 0.23% 6.891us 3.86% 118.002us 39.334us 0.000us 0.00% 736.031us 245.344us 3
aten::convolution 0.33% 9.930us 3.64% 111.111us 37.037us 0.000us 0.00% 736.031us 245.344us 3
aten::_convolution 0.74% 22.558us 3.31% 101.181us 33.727us 0.000us 0.00% 736.031us 245.344us 3
aten::_conv_depthwise2d 0.70% 21.291us 2.07% 63.232us 21.077us 736.031us 51.57% 736.031us 245.344us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 736.031us 51.57% 736.031us 245.344us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 401.120us 28.11% 401.120us 133.707us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 290.048us 20.32% 290.048us 96.683us 3
Activity Buffer Request 47.17% 1.441ms 47.17% 1.441ms 1.441ms 75.264us 5.27% 75.264us 75.264us 1
aten::empty_strided 0.95% 29.171us 0.95% 29.171us 4.862us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 6.58% 201.084us 6.58% 201.084us 22.343us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.57% 17.550us 0.75% 22.971us 2.552us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.30% 9.131us 0.30% 9.131us 0.609us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.33% 9.960us 0.33% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.30% 9.060us 0.30% 9.060us 3.020us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.18% 5.561us 0.23% 7.041us 2.347us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.055ms
Self CUDA time total: 1.427ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.09 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.14 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.09 True
torch_eager cuda_B2_D2048_S512_W4 0.08 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
torch_eager cuda_B2_D64_S2048_W2 0.09 True
torch_eager cuda_B2_D64_S2048_W4 0.08 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.09 True
torch_eager cuda_B4_D2048_S128_W2 0.08 True
torch_eager cuda_B4_D2048_S128_W4 0.08 True
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
torch_eager cuda_B4_D2048_S512_W2 0.09 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.08 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.08 True
torch_eager cuda_B4_D64_S2048_W4 0.08 True
torch_eager cuda_B4_D64_S512_W2 0.08 True
torch_eager cuda_B4_D64_S512_W4 0.08 True