Running activation benchmark on cuda with 9 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 179.327us 1411.47% 179.327us 179.327us 1
torch_eager 11.22% 210.364us 99.57% 1.867ms 1.867ms 0.000us 0.00% 15.009us 15.009us 1
aten::silu 3.37% 63.151us 82.30% 1.543ms 514.355us 6.497us 51.14% 8.801us 2.934us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 51.14% 6.497us 2.166us 3
aten::mul 1.76% 33.030us 2.90% 54.310us 18.103us 6.208us 48.86% 6.208us 2.069us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.86% 6.208us 2.069us 3
Activity Buffer Request 76.72% 1.439ms 76.72% 1.439ms 1.439ms 2.304us 18.13% 2.304us 2.304us 1
aten::slice 2.52% 47.241us 3.15% 59.052us 9.842us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.63% 11.811us 0.63% 11.811us 1.968us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.34% 62.690us 3.34% 62.690us 10.448us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.43% 8.120us 0.43% 8.120us 8.120us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.875ms
Self CUDA time total: 12.705us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.777us 1228.76% 151.777us 151.777us 1
torch_eager 6.62% 113.831us 99.66% 1.713ms 1.713ms 0.000us 0.00% 14.496us 14.496us 1
aten::silu 2.46% 42.260us 88.64% 1.523ms 507.722us 6.368us 51.55% 8.512us 2.837us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 51.55% 6.368us 2.123us 3
aten::mul 1.53% 26.241us 2.60% 44.713us 14.904us 5.984us 48.45% 5.984us 1.995us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.45% 5.984us 1.995us 3
Activity Buffer Request 84.63% 1.454ms 84.63% 1.454ms 1.454ms 2.144us 17.36% 2.144us 2.144us 1
aten::slice 1.45% 24.880us 1.80% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.35% 6.040us 0.35% 6.040us 1.007us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.62% 45.062us 2.62% 45.062us 7.510us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.34% 5.800us 0.34% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.718ms
Self CUDA time total: 12.352us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.422us 1145.66% 151.422us 151.422us 1
torch_eager 6.39% 108.591us 99.69% 1.694ms 1.694ms 0.000us 0.00% 15.489us 15.489us 1
aten::silu 2.42% 41.180us 88.84% 1.509ms 503.045us 6.784us 51.33% 9.056us 3.019us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.33% 6.784us 2.261us 3
aten::mul 1.56% 26.573us 2.72% 46.263us 15.421us 6.433us 48.67% 6.433us 2.144us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.67% 6.433us 2.144us 3
Activity Buffer Request 84.90% 1.442ms 84.90% 1.442ms 1.442ms 2.272us 17.19% 2.272us 2.272us 1
aten::slice 1.42% 24.110us 1.74% 29.570us 4.928us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.32% 5.460us 0.32% 5.460us 0.910us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.67% 45.420us 2.67% 45.420us 7.570us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.31% 5.240us 0.31% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.699ms
Self CUDA time total: 13.217us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.159us 1197.73% 152.159us 152.159us 1
torch_eager 7.49% 109.251us 99.65% 1.454ms 1.454ms 0.000us 0.00% 14.912us 14.912us 1
aten::silu 2.87% 41.871us 86.91% 1.268ms 422.724us 6.560us 51.64% 8.768us 2.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.64% 6.560us 2.187us 3
aten::mul 1.82% 26.542us 3.09% 45.132us 15.044us 6.144us 48.36% 6.144us 2.048us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.36% 6.144us 2.048us 3
Activity Buffer Request 71.19% 1.039ms 71.19% 1.039ms 1.039ms 2.208us 17.38% 2.208us 2.208us 1
aten::slice 1.75% 25.480us 2.16% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.42% 6.080us 0.42% 6.080us 1.013us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 14.12% 206.043us 14.12% 206.043us 34.340us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.35% 5.050us 0.35% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.459ms
Self CUDA time total: 12.704us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.295us 1117.14% 147.295us 147.295us 1
torch_eager 5.91% 105.630us 99.72% 1.782ms 1.782ms 0.000us 0.00% 15.457us 15.457us 1
aten::silu 2.35% 41.900us 89.64% 1.602ms 533.846us 6.752us 51.21% 9.024us 3.008us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.21% 6.752us 2.251us 3
aten::mul 1.43% 25.502us 2.46% 43.882us 14.627us 6.433us 48.79% 6.433us 2.144us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.79% 6.433us 2.144us 3
Activity Buffer Request 78.53% 1.403ms 78.53% 1.403ms 1.403ms 2.272us 17.23% 2.272us 2.272us 1
aten::slice 1.39% 24.781us 1.71% 30.582us 5.097us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.32% 5.801us 0.32% 5.801us 0.967us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.80% 175.053us 9.80% 175.053us 29.176us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.28% 4.969us 0.28% 4.969us 4.969us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.787ms
Self CUDA time total: 13.185us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.964us 937.33% 143.964us 143.964us 1
torch_eager 21.41% 103.402us 98.95% 477.918us 477.918us 0.000us 0.00% 18.047us 18.047us 1
aten::silu 9.04% 43.640us 62.61% 302.394us 100.798us 7.872us 51.25% 10.560us 3.520us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.25% 7.872us 2.624us 3
aten::mul 5.13% 24.761us 8.85% 42.722us 14.241us 7.487us 48.75% 7.487us 2.496us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.487us 48.75% 7.487us 2.496us 3
Activity Buffer Request 22.09% 106.692us 22.09% 106.692us 106.692us 2.688us 17.50% 2.688us 2.688us 1
aten::slice 4.94% 23.880us 6.09% 29.400us 4.900us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 1.14% 5.520us 1.14% 5.520us 0.920us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 35.20% 170.023us 35.20% 170.023us 28.337us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 1.05% 5.060us 1.05% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 482.978us
Self CUDA time total: 15.359us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.301us 1078.65% 154.301us 154.301us 1
torch_eager 5.96% 107.399us 99.74% 1.796ms 1.796ms 0.000us 0.00% 16.769us 16.769us 1
aten::silu 2.38% 42.931us 89.51% 1.612ms 537.266us 7.328us 51.23% 9.792us 3.264us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
aten::mul 1.49% 26.893us 2.55% 45.883us 15.294us 6.977us 48.77% 6.977us 2.326us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.977us 48.77% 6.977us 2.326us 3
Activity Buffer Request 78.67% 1.417ms 78.67% 1.417ms 1.417ms 2.464us 17.22% 2.464us 2.464us 1
aten::slice 1.40% 25.140us 1.72% 31.031us 5.172us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.33% 5.891us 0.33% 5.891us 0.982us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.51% 171.283us 9.51% 171.283us 28.547us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 4.600us 0.26% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.801ms
Self CUDA time total: 14.305us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.686us 1002.89% 154.686us 154.686us 1
torch_eager 22.31% 107.382us 99.03% 476.668us 476.668us 0.000us 0.00% 18.080us 18.080us 1
aten::silu 9.43% 45.390us 60.13% 289.404us 96.468us 7.872us 51.04% 10.528us 3.509us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.04% 7.872us 2.624us 3
aten::mul 6.54% 31.461us 10.39% 50.022us 16.674us 7.552us 48.96% 7.552us 2.517us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.96% 7.552us 2.517us 3
Activity Buffer Request 19.41% 93.401us 19.41% 93.401us 93.401us 2.656us 17.22% 2.656us 2.656us 1
aten::slice 5.01% 24.090us 6.20% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 1.20% 5.770us 1.20% 5.770us 0.962us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 35.15% 169.174us 35.15% 169.174us 28.196us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.97% 4.650us 0.97% 4.650us 4.650us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 481.318us
Self CUDA time total: 15.424us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.678us 692.09% 155.678us 155.678us 1
torch_eager 6.04% 109.222us 99.73% 1.805ms 1.805ms 0.000us 0.00% 26.365us 26.365us 1
aten::silu 2.28% 41.351us 89.49% 1.620ms 539.866us 11.614us 51.63% 15.485us 5.162us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.614us 51.63% 11.614us 3.871us 3
aten::mul 1.47% 26.681us 2.47% 44.641us 14.880us 10.880us 48.37% 10.880us 3.627us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.880us 48.37% 10.880us 3.627us 3
Activity Buffer Request 78.73% 1.425ms 78.73% 1.425ms 1.425ms 3.871us 17.21% 3.871us 3.871us 1
aten::slice 1.39% 25.188us 1.73% 31.390us 5.232us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.34% 6.202us 0.34% 6.202us 1.034us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.47% 171.352us 9.47% 171.352us 28.559us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.27% 4.900us 0.27% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.810ms
Self CUDA time total: 22.494us
impl wl p50(ms) ok
torch_eager cuda_T128_D1024 0.05 True
torch_eager cuda_T128_D2048 0.05 True
torch_eager cuda_T128_D768 0.04 True
torch_eager cuda_T256_D1024 0.05 True
torch_eager cuda_T256_D2048 0.05 True
torch_eager cuda_T256_D768 0.05 True
torch_eager cuda_T512_D1024 0.05 True
torch_eager cuda_T512_D2048 0.05 True
torch_eager cuda_T512_D768 0.05 True