Running rotary benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.038ms 1165.07% 1.038ms 1.038ms 1
torch_eager 14.25% 384.344us 99.73% 2.691ms 2.691ms 0.000us 0.00% 90.272us 90.272us 1
aten::mul 6.11% 164.889us 10.39% 280.433us 11.685us 46.752us 52.50% 46.752us 1.948us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.752us 52.50% 46.752us 1.948us 24
aten::copy_ 4.15% 111.919us 62.66% 1.690ms 93.917us 29.025us 32.59% 30.240us 1.680us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.306us 25.05% 22.306us 1.859us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.91% 13.280us 1.107us 12
aten::clone 1.43% 38.559us 61.06% 1.647ms 274.577us 0.000us 0.00% 7.934us 1.322us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.719us 7.54% 6.719us 1.120us 6
aten::sub 1.59% 42.770us 2.55% 68.721us 11.454us 6.688us 7.51% 6.688us 1.115us 6
aten::add 1.63% 44.070us 2.49% 67.170us 11.195us 6.592us 7.40% 6.592us 1.099us 6
Activity Buffer Request 53.52% 1.444ms 53.52% 1.444ms 1.444ms 1.215us 1.36% 1.215us 1.215us 1
aten::empty_strided 2.14% 57.723us 2.14% 57.723us 9.620us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.62% 70.572us 2.62% 70.572us 11.762us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.99% 80.691us 3.82% 103.161us 4.298us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.83% 22.470us 0.83% 22.470us 0.936us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.47% 228.526us 8.47% 228.526us 4.761us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.27% 7.361us 0.27% 7.361us 7.361us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.698ms
Self CUDA time total: 89.057us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 960.319us 1064.55% 960.319us 960.319us 1
torch_eager 12.91% 327.841us 99.79% 2.533ms 2.533ms 0.000us 0.00% 91.361us 91.361us 1
aten::mul 6.09% 154.573us 10.36% 263.046us 10.960us 47.616us 52.78% 47.616us 1.984us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.616us 52.78% 47.616us 1.984us 24
aten::copy_ 4.38% 111.264us 65.83% 1.671ms 92.839us 29.313us 32.49% 30.465us 1.692us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.496us 24.94% 22.496us 1.875us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.72% 13.280us 1.107us 12
aten::clone 1.07% 27.110us 62.73% 1.592ms 265.408us 0.000us 0.00% 7.969us 1.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.817us 7.56% 6.817us 1.136us 6
aten::sub 1.66% 42.072us 2.63% 66.652us 11.109us 6.688us 7.41% 6.688us 1.115us 6
aten::add 1.28% 32.560us 2.18% 55.291us 9.215us 6.592us 7.31% 6.592us 1.099us 6
Activity Buffer Request 56.87% 1.444ms 56.87% 1.444ms 1.444ms 1.152us 1.28% 1.152us 1.152us 1
aten::empty_strided 1.25% 31.671us 1.25% 31.671us 5.278us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.12% 53.780us 2.12% 53.780us 8.963us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.76% 70.023us 3.57% 90.653us 3.777us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.81% 20.630us 0.81% 20.630us 0.860us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.59% 218.025us 8.59% 218.025us 4.542us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.289us 0.21% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.539ms
Self CUDA time total: 90.209us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 950.812us 1013.41% 950.812us 950.812us 1
torch_eager 12.58% 319.124us 99.78% 2.531ms 2.531ms 0.000us 0.00% 95.135us 95.135us 1
aten::mul 6.09% 154.550us 10.34% 262.291us 10.929us 48.671us 51.88% 48.671us 2.028us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.671us 51.88% 48.671us 2.028us 24
aten::copy_ 4.10% 104.029us 66.32% 1.682ms 93.470us 30.783us 32.81% 32.095us 1.783us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.943us 24.45% 22.943us 1.912us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.369us 15.32% 14.369us 1.197us 12
aten::clone 1.04% 26.300us 63.34% 1.607ms 267.803us 0.000us 0.00% 9.152us 1.525us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 8.36% 7.840us 1.307us 6
aten::sub 1.64% 41.492us 2.64% 66.953us 11.159us 7.199us 7.67% 7.199us 1.200us 6
aten::add 1.26% 31.999us 2.14% 54.310us 9.052us 7.170us 7.64% 7.170us 1.195us 6
Activity Buffer Request 57.64% 1.462ms 57.64% 1.462ms 1.462ms 1.312us 1.40% 1.312us 1.312us 1
aten::empty_strided 1.26% 31.840us 1.26% 31.840us 5.307us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.05% 52.102us 2.05% 52.102us 8.684us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.68% 67.986us 3.47% 87.958us 3.665us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.79% 19.972us 0.79% 19.972us 0.832us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.65% 219.475us 8.65% 219.475us 4.572us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.22% 5.651us 0.22% 5.651us 5.651us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.537ms
Self CUDA time total: 93.823us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 952.670us 942.15% 952.670us 952.670us 1
torch_eager 11.55% 312.506us 99.79% 2.701ms 2.701ms 0.000us 0.00% 102.429us 102.429us 1
aten::mul 5.68% 153.743us 9.71% 262.695us 10.946us 52.765us 52.18% 52.765us 2.199us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.765us 52.18% 52.765us 2.199us 24
aten::copy_ 3.97% 107.471us 68.61% 1.857ms 103.165us 32.353us 32.00% 33.665us 1.870us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 24.37% 24.641us 2.053us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.999us 15.82% 15.999us 1.333us 12
aten::clone 1.01% 27.330us 65.76% 1.780ms 296.625us 0.000us 0.00% 9.024us 1.504us 6
aten::add 1.21% 32.850us 2.05% 55.600us 9.267us 8.032us 7.94% 8.032us 1.339us 6
aten::sub 1.44% 39.082us 2.35% 63.492us 10.582us 7.967us 7.88% 7.967us 1.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.63% 7.712us 1.285us 6
Activity Buffer Request 52.99% 1.434ms 52.99% 1.434ms 1.434ms 1.312us 1.30% 1.312us 1.312us 1
aten::empty_strided 1.20% 32.420us 1.20% 32.420us 5.403us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.27% 250.924us 9.27% 250.924us 41.821us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.56% 69.212us 3.32% 89.782us 3.741us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.76% 20.570us 0.76% 20.570us 0.857us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.14% 220.374us 8.14% 220.374us 4.591us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.791us 0.21% 5.791us 5.791us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.707ms
Self CUDA time total: 101.117us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 987.399us 1051.70% 987.399us 987.399us 1
torch_eager 12.37% 335.778us 99.82% 2.710ms 2.710ms 0.000us 0.00% 95.198us 95.198us 1
aten::mul 5.74% 155.881us 9.81% 266.305us 11.096us 48.927us 52.11% 48.927us 2.039us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.927us 52.11% 48.927us 2.039us 24
aten::copy_ 3.95% 107.229us 67.43% 1.830ms 101.693us 30.753us 32.76% 32.065us 1.781us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.881us 24.37% 22.881us 1.907us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.206us 15.13% 14.206us 1.184us 12
aten::clone 0.99% 26.953us 64.69% 1.756ms 292.683us 0.000us 0.00% 9.184us 1.531us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 8.38% 7.872us 1.312us 6
aten::add 1.25% 33.910us 2.11% 57.361us 9.560us 7.103us 7.57% 7.103us 1.184us 6
aten::sub 1.62% 44.010us 2.55% 69.231us 11.538us 7.103us 7.57% 7.103us 1.184us 6
Activity Buffer Request 53.49% 1.452ms 53.49% 1.452ms 1.452ms 1.312us 1.40% 1.312us 1.312us 1
aten::empty_strided 1.24% 33.730us 1.24% 33.730us 5.622us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 7.66% 207.874us 7.66% 207.874us 34.646us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.54% 68.958us 3.31% 89.820us 3.743us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.77% 20.862us 0.77% 20.862us 0.869us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.19% 222.327us 8.19% 222.327us 4.632us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 5.000us 0.18% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.715ms
Self CUDA time total: 93.886us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.771us 930.81% 939.771us 939.771us 1
torch_eager 11.42% 294.218us 99.78% 2.570ms 2.570ms 0.000us 0.00% 102.276us 102.276us 1
aten::mul 5.85% 150.653us 10.08% 259.594us 10.816us 52.609us 52.11% 52.609us 2.192us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.609us 52.11% 52.609us 2.192us 24
aten::copy_ 4.01% 103.273us 68.02% 1.752ms 97.337us 32.450us 32.14% 33.763us 1.876us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 24.40% 24.640us 2.053us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.904us 15.75% 15.904us 1.325us 12
aten::clone 0.87% 22.360us 64.99% 1.674ms 278.983us 0.000us 0.00% 9.123us 1.520us 6
aten::sub 1.58% 40.669us 2.53% 65.240us 10.873us 7.968us 7.89% 7.968us 1.328us 6
aten::add 1.32% 33.930us 2.20% 56.580us 9.430us 7.936us 7.86% 7.936us 1.323us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.810us 7.74% 7.810us 1.302us 6
Activity Buffer Request 54.28% 1.398ms 54.28% 1.398ms 1.398ms 1.313us 1.30% 1.313us 1.313us 1
aten::empty_strided 1.21% 31.291us 1.21% 31.291us 5.215us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 7.34% 188.943us 7.34% 188.943us 31.491us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.69% 69.330us 3.44% 88.671us 3.695us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.75% 19.341us 0.75% 19.341us 0.806us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.46% 218.003us 8.46% 218.003us 4.542us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.22% 5.651us 0.22% 5.651us 5.651us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.576ms
Self CUDA time total: 100.963us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 987.019us 820.52% 987.019us 987.019us 1
torch_eager 11.12% 293.915us 99.79% 2.637ms 2.637ms 0.000us 0.00% 122.116us 122.116us 1
aten::mul 6.22% 164.251us 10.48% 276.937us 11.539us 61.922us 51.48% 61.922us 2.580us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.922us 51.48% 61.922us 2.580us 24
aten::copy_ 3.96% 104.584us 67.08% 1.772ms 98.461us 39.265us 32.64% 41.089us 2.283us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.833us 23.97% 28.833us 2.403us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.105us 15.88% 19.105us 1.592us 12
aten::clone 0.81% 21.321us 64.15% 1.695ms 282.483us 0.000us 0.00% 12.256us 2.043us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.67% 10.432us 1.739us 6
aten::sub 1.58% 41.691us 2.56% 67.622us 11.270us 9.569us 7.95% 9.569us 1.595us 6
aten::add 1.31% 34.540us 2.17% 57.381us 9.563us 9.536us 7.93% 9.536us 1.589us 6
Activity Buffer Request 53.87% 1.423ms 53.87% 1.423ms 1.423ms 1.824us 1.52% 1.824us 1.824us 1
aten::empty_strided 1.17% 30.940us 1.17% 30.940us 5.157us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.97% 184.193us 6.97% 184.193us 30.699us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 3.59% 94.920us 4.40% 116.150us 4.840us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.80% 21.230us 0.80% 21.230us 0.885us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.38% 221.517us 8.38% 221.517us 4.615us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.631us 0.21% 5.631us 5.631us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.642ms
Self CUDA time total: 120.292us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 942.977us 547.62% 942.977us 942.977us 1
torch_eager 11.98% 313.186us 99.77% 2.608ms 2.608ms 0.000us 0.00% 175.043us 175.043us 1
aten::mul 5.92% 154.664us 10.07% 263.135us 10.964us 89.731us 52.11% 89.731us 3.739us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.731us 52.11% 89.731us 3.739us 24
aten::copy_ 4.21% 110.022us 67.75% 1.771ms 98.397us 57.632us 33.47% 60.480us 3.360us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.640us 23.60% 40.640us 3.387us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.832us 14.42% 24.832us 2.069us 12
aten::clone 1.00% 26.050us 64.65% 1.690ms 281.685us 0.000us 0.00% 19.840us 3.307us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.87% 16.992us 2.832us 6
aten::add 1.22% 32.012us 2.08% 54.302us 9.050us 12.416us 7.21% 12.416us 2.069us 6
aten::sub 1.48% 38.721us 2.41% 62.881us 10.480us 12.416us 7.21% 12.416us 2.069us 6
Activity Buffer Request 54.20% 1.417ms 54.20% 1.417ms 1.417ms 2.848us 1.65% 2.848us 2.848us 1
aten::empty_strided 1.15% 30.180us 1.15% 30.180us 5.030us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.98% 182.574us 6.98% 182.574us 30.429us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.56% 66.979us 3.34% 87.351us 3.640us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.78% 20.372us 0.78% 20.372us 0.849us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.28% 216.491us 8.28% 216.491us 4.510us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.23% 5.900us 0.23% 5.900us 5.900us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.614ms
Self CUDA time total: 172.195us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.334us 791.88% 954.334us 954.334us 1
torch_eager 21.12% 286.823us 99.60% 1.352ms 1.352ms 0.000us 0.00% 122.339us 122.339us 1
aten::mul 11.39% 154.733us 19.43% 263.854us 10.994us 61.889us 51.35% 61.889us 2.579us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.889us 51.35% 61.889us 2.579us 24
aten::copy_ 8.06% 109.392us 38.94% 528.759us 29.376us 39.393us 32.69% 41.217us 2.290us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.864us 23.95% 28.864us 2.405us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.233us 15.96% 19.233us 1.603us 12
aten::clone 1.54% 20.901us 32.67% 443.638us 73.940us 0.000us 0.00% 12.353us 2.059us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.529us 8.74% 10.529us 1.755us 6
aten::sub 2.93% 39.731us 4.81% 65.293us 10.882us 9.633us 7.99% 9.633us 1.606us 6
aten::add 2.54% 34.552us 4.77% 64.792us 10.799us 9.600us 7.97% 9.600us 1.600us 6
Activity Buffer Request 12.72% 172.763us 12.72% 172.763us 172.763us 1.824us 1.51% 1.824us 1.824us 1
aten::empty_strided 2.32% 31.561us 2.32% 31.561us 5.260us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.45% 182.623us 13.45% 182.623us 30.437us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.17% 70.140us 6.66% 90.481us 3.770us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.50% 20.341us 1.50% 20.341us 0.848us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.86% 228.904us 16.86% 228.904us 4.769us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.40% 5.490us 0.40% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.358ms
Self CUDA time total: 120.515us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 961.439us 559.06% 961.439us 961.439us 1
torch_eager 21.39% 301.083us 99.65% 1.403ms 1.403ms 0.000us 0.00% 174.821us 174.821us 1
aten::mul 10.92% 153.723us 18.79% 264.437us 11.018us 89.541us 52.07% 89.541us 3.731us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.541us 52.07% 89.541us 3.731us 24
aten::copy_ 8.57% 120.662us 41.11% 578.630us 32.146us 57.631us 33.51% 60.479us 3.360us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.639us 23.63% 40.639us 3.387us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.801us 14.42% 24.801us 2.067us 12
aten::clone 1.49% 21.022us 33.99% 478.490us 79.748us 0.000us 0.00% 19.840us 3.307us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.88% 16.992us 2.832us 6
aten::add 2.26% 31.841us 3.85% 54.131us 9.022us 12.481us 7.26% 12.481us 2.080us 6
aten::sub 2.79% 39.260us 4.52% 63.691us 10.615us 12.320us 7.16% 12.320us 2.053us 6
Activity Buffer Request 15.02% 211.404us 15.02% 211.404us 211.404us 2.848us 1.66% 2.848us 2.848us 1
aten::empty_strided 2.10% 29.500us 2.10% 29.500us 4.917us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.01% 183.184us 13.01% 183.184us 30.531us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.96% 69.812us 6.41% 90.211us 3.759us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.45% 20.399us 1.45% 20.399us 0.850us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.69% 220.815us 15.69% 220.815us 4.600us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 4.890us 0.35% 4.890us 4.890us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.408ms
Self CUDA time total: 171.973us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 959.740us 338.81% 959.740us 959.740us 1
torch_eager 11.78% 309.495us 99.81% 2.622ms 2.622ms 0.000us 0.00% 301.248us 301.248us 1
aten::mul 5.80% 152.430us 9.98% 262.294us 10.929us 133.378us 47.09% 133.378us 5.557us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.378us 47.09% 133.378us 5.557us 24
aten::copy_ 4.09% 107.511us 67.37% 1.770ms 98.338us 108.832us 38.42% 126.816us 7.045us 18
aten::clone 1.07% 28.041us 64.54% 1.696ms 282.603us 0.000us 0.00% 69.600us 11.600us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.216us 20.20% 57.216us 4.768us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.616us 18.22% 51.616us 8.603us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.054us 14.49% 41.054us 3.421us 12
aten::sub 1.57% 41.190us 2.52% 66.080us 11.013us 20.607us 7.27% 20.607us 3.434us 6
aten::add 1.56% 40.972us 2.46% 64.512us 10.752us 20.447us 7.22% 20.447us 3.408us 6
Activity Buffer Request 53.79% 1.413ms 53.79% 1.413ms 1.413ms 17.984us 6.35% 17.984us 17.984us 1
aten::empty_strided 1.19% 31.311us 1.19% 31.311us 5.218us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 7.14% 187.713us 7.14% 187.713us 31.285us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.66% 69.760us 3.44% 90.282us 3.762us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.78% 20.522us 0.78% 20.522us 0.855us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.37% 219.936us 8.37% 219.936us 4.582us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.111us 0.19% 5.111us 5.111us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.627ms
Self CUDA time total: 283.264us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 964.299us 170.17% 964.299us 964.299us 1
torch_eager 21.37% 289.253us 99.58% 1.348ms 1.348ms 0.000us 0.00% 590.419us 590.419us 1
aten::copy_ 7.69% 104.123us 37.93% 513.450us 28.525us 274.106us 48.37% 297.849us 16.547us 18
aten::mul 11.75% 159.118us 20.07% 271.705us 11.321us 226.427us 39.96% 226.427us 9.434us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 226.427us 39.96% 226.427us 9.434us 24
aten::clone 1.55% 21.020us 32.53% 440.358us 73.393us 0.000us 0.00% 206.843us 34.474us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.100us 32.31% 183.100us 30.517us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.006us 16.06% 91.006us 7.584us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.143us 11.67% 66.143us 5.512us 12
aten::sub 3.06% 41.432us 4.99% 67.562us 11.260us 33.664us 5.94% 33.664us 5.611us 6
aten::add 2.43% 32.930us 4.17% 56.451us 9.408us 32.479us 5.73% 32.479us 5.413us 6
Activity Buffer Request 11.95% 161.793us 11.95% 161.793us 161.793us 23.743us 4.19% 23.743us 23.743us 1
aten::empty_strided 2.85% 38.611us 2.85% 38.611us 6.435us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.59% 183.934us 13.59% 183.934us 30.656us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.13% 69.460us 6.64% 89.941us 3.748us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.51% 20.481us 1.51% 20.481us 0.853us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.68% 225.838us 16.68% 225.838us 4.705us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.42% 5.710us 0.42% 5.710us 5.710us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.354ms
Self CUDA time total: 566.676us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 940.757us 1018.68% 940.757us 940.757us 1
torch_eager 20.92% 284.932us 99.61% 1.357ms 1.357ms 0.000us 0.00% 93.503us 93.503us 1
aten::mul 11.51% 156.743us 19.57% 266.566us 11.107us 49.664us 53.78% 49.664us 2.069us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.664us 53.78% 49.664us 2.069us 24
aten::copy_ 7.76% 105.742us 39.84% 542.619us 30.146us 29.343us 31.77% 30.495us 1.694us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.528us 24.39% 22.528us 1.877us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.344us 14.45% 13.344us 1.112us 12
aten::clone 1.52% 20.734us 33.85% 461.099us 76.850us 0.000us 0.00% 7.967us 1.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.815us 7.38% 6.815us 1.136us 6
aten::sub 2.96% 40.252us 4.79% 65.263us 10.877us 6.688us 7.24% 6.688us 1.115us 6
aten::add 2.34% 31.811us 3.99% 54.311us 9.052us 6.656us 7.21% 6.656us 1.109us 6
Activity Buffer Request 14.09% 191.853us 14.09% 191.853us 191.853us 1.152us 1.25% 1.152us 1.152us 1
aten::empty_strided 2.30% 31.379us 2.30% 31.379us 5.230us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.46% 183.403us 13.46% 183.403us 30.567us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.20% 70.859us 6.67% 90.910us 3.788us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.47% 20.051us 1.47% 20.051us 0.835us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.08% 218.955us 16.08% 218.955us 4.562us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.39% 5.360us 0.39% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.362ms
Self CUDA time total: 92.351us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.706us 986.10% 945.706us 945.706us 1
torch_eager 12.18% 322.968us 99.79% 2.647ms 2.647ms 0.000us 0.00% 97.216us 97.216us 1
aten::mul 5.85% 155.091us 9.99% 264.924us 11.039us 50.947us 53.12% 50.947us 2.123us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 50.947us 53.12% 50.947us 2.123us 24
aten::copy_ 3.92% 103.931us 67.30% 1.785ms 99.174us 30.783us 32.10% 32.095us 1.783us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 23.96% 22.976us 1.915us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.174us 14.78% 14.174us 1.181us 12
aten::clone 1.18% 31.280us 64.70% 1.716ms 286.035us 0.000us 0.00% 9.119us 1.520us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 8.14% 7.807us 1.301us 6
aten::add 1.22% 32.380us 2.09% 55.311us 9.219us 7.102us 7.41% 7.102us 1.184us 6
aten::sub 1.50% 39.882us 2.41% 63.892us 10.649us 7.072us 7.37% 7.072us 1.179us 6
Activity Buffer Request 53.95% 1.431ms 53.95% 1.431ms 1.431ms 1.312us 1.37% 1.312us 1.312us 1
aten::empty_strided 1.23% 32.600us 1.23% 32.600us 5.433us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 7.05% 187.002us 7.05% 187.002us 31.167us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.63% 69.642us 3.43% 90.901us 3.788us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.80% 21.259us 0.80% 21.259us 0.886us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.29% 220.006us 8.29% 220.006us 4.583us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.569us 0.21% 5.569us 5.569us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.653ms
Self CUDA time total: 95.904us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 963.956us 929.78% 963.956us 963.956us 1
torch_eager 11.95% 315.942us 99.78% 2.637ms 2.637ms 0.000us 0.00% 104.988us 104.988us 1
aten::mul 6.01% 158.721us 10.21% 269.951us 11.248us 55.295us 53.33% 55.295us 2.304us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.295us 53.33% 55.295us 2.304us 24
aten::copy_ 4.03% 106.403us 67.45% 1.783ms 99.031us 32.417us 31.27% 33.729us 1.874us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.607us 23.73% 24.607us 2.051us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.964us 15.40% 15.964us 1.330us 12
aten::clone 1.02% 26.870us 64.62% 1.708ms 284.615us 0.000us 0.00% 9.122us 1.520us 6
aten::add 1.23% 32.629us 2.10% 55.390us 9.232us 7.997us 7.71% 7.997us 1.333us 6
aten::sub 1.44% 38.041us 2.36% 62.260us 10.377us 7.967us 7.68% 7.967us 1.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.810us 7.53% 7.810us 1.302us 6
Activity Buffer Request 54.08% 1.429ms 54.08% 1.429ms 1.429ms 1.312us 1.27% 1.312us 1.312us 1
aten::empty_strided 1.27% 33.640us 1.27% 33.640us 5.607us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.95% 183.544us 6.95% 183.544us 30.591us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.64% 69.789us 3.42% 90.471us 3.770us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.78% 20.682us 0.78% 20.682us 0.862us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.39% 221.610us 8.39% 221.610us 4.617us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.22% 5.700us 0.22% 5.700us 5.700us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.643ms
Self CUDA time total: 103.676us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 933.942us 757.68% 933.942us 933.942us 1
torch_eager 21.17% 287.829us 99.59% 1.354ms 1.354ms 0.000us 0.00% 125.024us 125.024us 1
aten::mul 11.38% 154.770us 19.33% 262.774us 10.949us 64.862us 52.62% 64.862us 2.703us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 64.862us 52.62% 64.862us 2.703us 24
aten::copy_ 7.76% 105.560us 40.17% 546.058us 30.337us 39.265us 31.85% 41.025us 2.279us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.865us 23.42% 28.865us 2.405us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.137us 15.53% 19.137us 1.595us 12
aten::clone 1.51% 20.520us 34.08% 463.317us 77.220us 0.000us 0.00% 12.160us 2.027us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 8.44% 10.400us 1.733us 6
aten::sub 2.90% 39.471us 4.67% 63.511us 10.585us 9.569us 7.76% 9.569us 1.595us 6
aten::add 2.50% 34.030us 4.22% 57.431us 9.572us 9.568us 7.76% 9.568us 1.595us 6
Activity Buffer Request 14.30% 194.363us 14.30% 194.363us 194.363us 1.760us 1.43% 1.760us 1.760us 1
aten::empty_strided 2.23% 30.321us 2.23% 30.321us 5.053us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.45% 182.914us 13.45% 182.914us 30.486us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.84% 65.748us 6.29% 85.480us 3.562us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.45% 19.732us 1.45% 19.732us 0.822us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.08% 218.666us 16.08% 218.666us 4.556us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.41% 5.560us 0.41% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.359ms
Self CUDA time total: 123.264us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 934.359us 900.66% 934.359us 934.359us 1
torch_eager 21.17% 286.322us 99.59% 1.347ms 1.347ms 0.000us 0.00% 105.086us 105.086us 1
aten::mul 11.62% 157.214us 19.66% 265.945us 11.081us 55.327us 53.33% 55.327us 2.305us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.327us 53.33% 55.327us 2.305us 24
aten::copy_ 7.65% 103.495us 39.66% 536.482us 29.805us 32.511us 31.34% 33.855us 1.881us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.704us 23.81% 24.704us 2.059us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.904us 15.33% 15.904us 1.325us 12
aten::clone 1.57% 21.280us 33.91% 458.650us 76.442us 0.000us 0.00% 9.151us 1.525us 6
aten::add 2.43% 32.883us 4.09% 55.372us 9.229us 8.001us 7.71% 8.001us 1.333us 6
aten::sub 2.87% 38.810us 4.64% 62.781us 10.463us 7.903us 7.62% 7.903us 1.317us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 7.53% 7.807us 1.301us 6
Activity Buffer Request 14.06% 190.184us 14.06% 190.184us 190.184us 1.344us 1.30% 1.344us 1.344us 1
aten::empty_strided 2.22% 30.070us 2.22% 30.070us 5.012us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.39% 181.103us 13.39% 181.103us 30.184us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.05% 68.302us 6.56% 88.771us 3.699us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.51% 20.469us 1.51% 20.469us 0.853us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.03% 216.891us 16.03% 216.891us 4.519us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.41% 5.591us 0.41% 5.591us 5.591us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.353ms
Self CUDA time total: 103.742us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.694us 764.03% 944.694us 944.694us 1
torch_eager 20.48% 287.824us 99.60% 1.400ms 1.400ms 0.000us 0.00% 125.438us 125.438us 1
aten::mul 10.91% 153.363us 18.83% 264.625us 11.026us 65.151us 52.69% 65.151us 2.715us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.151us 52.69% 65.151us 2.715us 24
aten::copy_ 7.88% 110.793us 41.73% 586.532us 32.585us 39.328us 31.81% 41.120us 2.284us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.895us 23.37% 28.895us 2.408us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.167us 15.50% 19.167us 1.597us 12
aten::clone 1.52% 21.310us 35.87% 504.089us 84.015us 0.000us 0.00% 12.225us 2.038us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.433us 8.44% 10.433us 1.739us 6
aten::sub 2.80% 39.332us 4.57% 64.213us 10.702us 9.632us 7.79% 9.632us 1.605us 6
aten::add 2.33% 32.799us 3.97% 55.790us 9.298us 9.535us 7.71% 9.535us 1.589us 6
Activity Buffer Request 15.08% 211.984us 15.08% 211.984us 211.984us 1.792us 1.45% 1.792us 1.792us 1
aten::empty_strided 2.18% 30.690us 2.18% 30.690us 5.115us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 14.35% 201.734us 14.35% 201.734us 33.622us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.89% 68.724us 6.32% 88.851us 3.702us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.43% 20.127us 1.43% 20.127us 0.839us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.74% 221.155us 15.74% 221.155us 4.607us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.40% 5.570us 0.40% 5.570us 5.570us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.405ms
Self CUDA time total: 123.646us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 938.077us 529.63% 938.077us 938.077us 1
torch_eager 22.00% 288.844us 99.57% 1.307ms 1.307ms 0.000us 0.00% 179.967us 179.967us 1
aten::mul 11.92% 156.562us 20.13% 264.245us 11.010us 94.881us 53.57% 94.881us 3.953us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.881us 53.57% 94.881us 3.953us 24
aten::copy_ 8.04% 105.524us 37.72% 495.290us 27.516us 57.663us 32.56% 60.511us 3.362us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.672us 22.96% 40.672us 3.389us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.575us 13.87% 24.575us 2.048us 12
aten::clone 1.60% 21.071us 31.51% 413.758us 68.960us 0.000us 0.00% 19.839us 3.306us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.991us 9.59% 16.991us 2.832us 6
aten::add 2.42% 31.800us 4.16% 54.561us 9.093us 12.288us 6.94% 12.288us 2.048us 6
aten::sub 3.05% 40.090us 5.01% 65.752us 10.959us 12.287us 6.94% 12.287us 2.048us 6
Activity Buffer Request 10.75% 141.113us 10.75% 141.113us 141.113us 2.848us 1.61% 2.848us 2.848us 1
aten::empty_strided 2.28% 29.940us 2.28% 29.940us 4.990us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 14.21% 186.543us 14.21% 186.543us 31.091us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.18% 67.990us 6.68% 87.660us 3.652us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.50% 19.670us 1.50% 19.670us 0.820us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.62% 218.216us 16.62% 218.216us 4.546us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.43% 5.650us 0.43% 5.650us 5.650us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.313ms
Self CUDA time total: 177.119us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.076us 318.26% 945.076us 945.076us 1
torch_eager 21.55% 289.808us 99.58% 1.339ms 1.339ms 0.000us 0.00% 314.171us 314.171us 1
aten::mul 11.43% 153.633us 19.62% 263.817us 10.992us 145.952us 49.15% 145.952us 6.081us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.952us 49.15% 145.952us 6.081us 24
aten::copy_ 9.11% 122.489us 38.99% 524.297us 29.128us 110.173us 37.10% 127.389us 7.077us 18
aten::clone 1.65% 22.169us 33.13% 445.468us 74.245us 0.000us 0.00% 70.110us 11.685us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.279us 19.29% 57.279us 4.773us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.894us 17.81% 52.894us 8.816us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.830us 13.75% 40.830us 3.402us 12
aten::sub 2.94% 39.549us 4.81% 64.690us 10.782us 20.511us 6.91% 20.511us 3.418us 6
aten::add 2.41% 32.411us 4.09% 55.020us 9.170us 20.319us 6.84% 20.319us 3.386us 6
Activity Buffer Request 11.32% 152.193us 11.32% 152.193us 152.193us 17.216us 5.80% 17.216us 17.216us 1
aten::empty_strided 2.31% 31.082us 2.31% 31.082us 5.180us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.88% 186.593us 13.88% 186.593us 31.099us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.09% 68.450us 6.56% 88.160us 3.673us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.47% 19.710us 1.47% 19.710us 0.821us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.43% 220.956us 16.43% 220.956us 4.603us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.42% 5.661us 0.42% 5.661us 5.661us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.345ms
Self CUDA time total: 296.955us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 986.080us 556.73% 986.080us 986.080us 1
torch_eager 12.52% 336.567us 99.81% 2.683ms 2.683ms 0.000us 0.00% 179.999us 179.999us 1
aten::mul 5.82% 156.365us 9.99% 268.575us 11.191us 94.976us 53.62% 94.976us 3.957us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.976us 53.62% 94.976us 3.957us 24
aten::copy_ 3.98% 106.939us 67.04% 1.802ms 100.094us 57.535us 32.48% 60.415us 3.356us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.703us 22.98% 40.703us 3.392us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.608us 13.89% 24.608us 2.051us 12
aten::clone 1.08% 29.091us 64.22% 1.726ms 287.668us 0.000us 0.00% 19.712us 3.285us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 9.50% 16.832us 2.805us 6
aten::add 1.21% 32.499us 2.06% 55.240us 9.207us 12.320us 6.96% 12.320us 2.053us 6
aten::sub 1.59% 42.650us 2.57% 69.041us 11.507us 12.288us 6.94% 12.288us 2.048us 6
Activity Buffer Request 53.52% 1.438ms 53.52% 1.438ms 1.438ms 2.880us 1.63% 2.880us 2.880us 1
aten::empty_strided 1.16% 31.221us 1.16% 31.221us 5.204us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 7.20% 193.473us 7.20% 193.473us 32.245us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.61% 70.195us 3.39% 91.232us 3.801us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.78% 21.037us 0.78% 21.037us 0.877us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.35% 224.324us 8.35% 224.324us 4.673us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 4.980us 0.19% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.688ms
Self CUDA time total: 177.119us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 955.007us 321.87% 955.007us 955.007us 1
torch_eager 21.61% 290.382us 99.58% 1.338ms 1.338ms 0.000us 0.00% 314.050us 314.050us 1
aten::mul 12.35% 165.965us 20.49% 275.388us 11.475us 146.274us 49.30% 146.274us 6.095us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.274us 49.30% 146.274us 6.095us 24
aten::copy_ 7.99% 107.375us 38.18% 513.111us 28.506us 109.984us 37.07% 127.328us 7.074us 18
aten::clone 1.53% 20.570us 31.98% 429.868us 71.645us 0.000us 0.00% 70.048us 11.675us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.280us 19.31% 57.280us 4.773us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.704us 17.76% 52.704us 8.784us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.448us 13.63% 40.448us 3.371us 12
aten::sub 2.99% 40.150us 4.79% 64.400us 10.733us 20.288us 6.84% 20.288us 3.381us 6
aten::add 2.45% 32.907us 4.13% 55.499us 9.250us 20.160us 6.79% 20.160us 3.360us 6
Activity Buffer Request 11.77% 158.223us 11.77% 158.223us 158.223us 17.344us 5.85% 17.344us 17.344us 1
aten::empty_strided 2.28% 30.711us 2.28% 30.711us 5.118us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.78% 185.224us 13.78% 185.224us 30.871us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.13% 68.942us 6.58% 88.372us 3.682us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.45% 19.430us 1.45% 19.430us 0.810us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.26% 218.554us 16.26% 218.554us 4.553us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.42% 5.611us 0.42% 5.611us 5.611us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.344ms
Self CUDA time total: 296.706us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 962.939us 164.48% 962.939us 962.939us 1
torch_eager 21.30% 292.019us 99.59% 1.365ms 1.365ms 0.000us 0.00% 609.117us 609.117us 1
aten::copy_ 7.59% 104.052us 39.10% 536.059us 29.781us 268.735us 45.90% 292.415us 16.245us 18
aten::mul 11.61% 159.130us 19.77% 271.083us 11.295us 251.454us 42.95% 251.454us 10.477us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.454us 42.95% 251.454us 10.477us 24
aten::clone 1.60% 21.919us 33.19% 455.067us 75.844us 0.000us 0.00% 201.504us 33.584us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.824us 30.37% 177.824us 29.637us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.911us 15.53% 90.911us 7.576us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.248us 11.15% 65.248us 5.437us 12
aten::sub 2.98% 40.869us 4.94% 67.700us 11.283us 32.703us 5.59% 32.703us 5.451us 6
aten::add 2.40% 32.850us 4.07% 55.841us 9.307us 32.545us 5.56% 32.545us 5.424us 6
Activity Buffer Request 13.18% 180.724us 13.18% 180.724us 180.724us 23.680us 4.04% 23.680us 23.680us 1
aten::empty_strided 2.23% 30.541us 2.23% 30.541us 5.090us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.71% 188.023us 13.71% 188.023us 31.337us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 5.13% 70.322us 6.59% 90.292us 3.762us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.46% 19.970us 1.46% 19.970us 0.832us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.41% 225.035us 16.41% 225.035us 4.688us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.41% 5.640us 0.41% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.371ms
Self CUDA time total: 585.437us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 9.18% 318.848us 77.56% 2.693ms 2.693ms 0.000us 0.00% 1.840ms 1.840ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.811ms 102.06% 1.811ms 1.811ms 1
aten::copy_ 3.19% 110.682us 53.02% 1.841ms 102.257us 792.737us 44.68% 858.369us 47.687us 18
aten::mul 4.39% 152.554us 7.57% 262.845us 10.952us 833.316us 46.97% 833.316us 34.721us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 833.316us 46.97% 833.316us 34.721us 24
aten::clone 0.79% 27.538us 50.82% 1.764ms 294.050us 0.000us 0.00% 624.865us 104.144us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 559.233us 31.52% 559.233us 93.206us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.504us 13.16% 233.504us 19.459us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 148.032us 8.34% 148.032us 12.336us 12
aten::sub 1.13% 39.132us 1.88% 65.111us 10.852us 90.112us 5.08% 90.112us 15.019us 6
Activity Buffer Request 41.37% 1.436ms 41.37% 1.436ms 1.436ms 65.632us 3.70% 65.632us 65.632us 1
aten::add 0.97% 33.650us 1.61% 56.062us 9.344us 57.920us 3.26% 57.920us 9.653us 6
aten::empty_strided 0.92% 31.941us 0.92% 31.941us 5.324us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.62% 229.834us 6.62% 229.834us 38.306us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.00% 69.363us 2.59% 89.831us 3.743us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.59% 20.468us 0.59% 20.468us 0.853us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 6.41% 222.613us 6.41% 222.613us 4.638us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 22.44% 778.913us 22.44% 778.913us 778.913us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.472ms
Self CUDA time total: 1.774ms
impl wl p50(ms) ok
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
torch_eager cuda_B1_S128_H32_D64_R32 0.23 True
torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B1_S512_H32_D128_R64 0.22 True
torch_eager cuda_B1_S512_H32_D64_R32 0.22 True
torch_eager cuda_B1_S512_H8_D128_R64 0.22 True
torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
torch_eager cuda_B2_S128_H8_D64_R32 0.22 True
torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
torch_eager cuda_B2_S2048_H8_D128_R64 0.23 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
torch_eager cuda_B2_S512_H32_D64_R32 0.23 True
torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
torch_eager cuda_B2_S512_H8_D64_R32 0.22 True