PyTorch Native - Rotary Position Embeddings

GPU Info

▼ code ▼ output ▶ uv-logs | Cell: nv | 0.21s | Raw GitHub
import subprocess
# Run `nvidia-smi`, capturing its output as text, then print the captured
# standard output.
smi_result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(smi_result.stdout)
Thu Oct 30 15:52:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
| N/A   30C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Rotary Embeddings Benchmark (PyTorch Native)

▼ code ▼ output ▶ uv-logs | Cell: benchmark | 3.86s | Raw GitHub
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Combine the pair ``(x1, x2)`` with ``cos``/``sin`` rotary coefficients.

    When ``conj`` is falsy the result is
    ``(x1*cos - x2*sin, x1*sin + x2*cos)``; when it is truthy the sign of
    both ``sin`` terms is flipped. Assuming ``cos`` and ``sin`` hold the
    cosine and sine of the same angles, these are a 2-D rotation and its
    inverse respectively.

    Returns:
        A 2-tuple ``(out1, out2)``. The inputs are only combined through
        arithmetic operators; nothing is assigned back into them.
    """
    if conj:
        # Conjugate form: same cos terms, negated sin terms.
        return x1 * cos + x2 * sin, -x1 * sin + x2 * cos
    return x1 * cos - x2 * sin, x1 * sin + x2 * cos


def torch_rotary(query, key, cos, sin, conj=False):
    """Apply the rotary transform to copies of ``query`` and ``key``.

    The rotary width is taken from ``cos.shape[-1]``. Along the last axis,
    the first ``rotary_dim`` entries and the next ``rotary_dim`` entries are
    passed to ``apply_rotary_torch`` as the two halves of a pair, and the
    results are written back into the same positions of the copy. Entries
    beyond ``2 * rotary_dim`` keep the values they had in the clone.

    Args:
        query: Object supporting ``.clone()`` and ellipsis slicing
            (presumably a torch tensor -- confirm against the harness).
        key: Same requirements as ``query``.
        cos: Coefficients whose last dimension defines ``rotary_dim``;
            assumed to broadcast against the sliced halves -- TODO confirm.
        sin: Coefficients paired with ``cos``; same assumption.
        conj: Forwarded unchanged to ``apply_rotary_torch``.

    Returns:
        ``(q_out, k_out)``: the transformed clones. ``query`` and ``key``
        themselves are not assigned into.
    """
    rotary_dim = cos.shape[-1]

    # The two adjacent windows of the last axis that form a rotation pair.
    first_half = slice(None, rotary_dim)
    second_half = slice(rotary_dim, 2 * rotary_dim)

    # Work on clones so the caller's inputs stay untouched.
    q_out, k_out = query.clone(), key.clone()

    # Query first, then key -- identical treatment for both buffers.
    for buf in (q_out, k_out):
        rotated_first, rotated_second = apply_rotary_torch(
            buf[..., first_half], buf[..., second_half], cos, sin, conj
        )
        # Both results are computed before either write, so overwriting the
        # first half cannot disturb the inputs used for the second.
        buf[..., first_half] = rotated_first
        buf[..., second_half] = rotated_second

    return q_out, k_out


# Describe the eager PyTorch rotary implementation, then hand it to the
# benchmark harness.
eager_spec = {
    "kernel_type": KernelTypeEnum.ROTARY,
    "impl_name": "torch_eager",
    "impl_tags": {"family": "pytorch", "backend": "eager"},
    "impl_func": torch_rotary,
}
run_benchmark(**eager_spec)
Running rotary benchmark on cuda with 24 workloads.

======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.038ms      1165.07%       1.038ms       1.038ms             1  
                                            torch_eager        14.25%     384.344us        99.73%       2.691ms       2.691ms       0.000us         0.00%      90.272us      90.272us             1  
                                              aten::mul         6.11%     164.889us        10.39%     280.433us      11.685us      46.752us        52.50%      46.752us       1.948us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.752us        52.50%      46.752us       1.948us            24  
                                            aten::copy_         4.15%     111.919us        62.66%       1.690ms      93.917us      29.025us        32.59%      30.240us       1.680us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.306us        25.05%      22.306us       1.859us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.91%      13.280us       1.107us            12  
                                            aten::clone         1.43%      38.559us        61.06%       1.647ms     274.577us       0.000us         0.00%       7.934us       1.322us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.719us         7.54%       6.719us       1.120us             6  
                                              aten::sub         1.59%      42.770us         2.55%      68.721us      11.454us       6.688us         7.51%       6.688us       1.115us             6  
                                              aten::add         1.63%      44.070us         2.49%      67.170us      11.195us       6.592us         7.40%       6.592us       1.099us             6  
                                Activity Buffer Request        53.52%       1.444ms        53.52%       1.444ms       1.444ms       1.215us         1.36%       1.215us       1.215us             1  
                                    aten::empty_strided         2.14%      57.723us         2.14%      57.723us       9.620us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.62%      70.572us         2.62%      70.572us      11.762us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.99%      80.691us         3.82%     103.161us       4.298us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.83%      22.470us         0.83%      22.470us       0.936us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.47%     228.526us         8.47%     228.526us       4.761us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.27%       7.361us         0.27%       7.361us       7.361us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.698ms
Self CUDA time total: 89.057us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     960.319us      1064.55%     960.319us     960.319us             1  
                                            torch_eager        12.91%     327.841us        99.79%       2.533ms       2.533ms       0.000us         0.00%      91.361us      91.361us             1  
                                              aten::mul         6.09%     154.573us        10.36%     263.046us      10.960us      47.616us        52.78%      47.616us       1.984us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.616us        52.78%      47.616us       1.984us            24  
                                            aten::copy_         4.38%     111.264us        65.83%       1.671ms      92.839us      29.313us        32.49%      30.465us       1.692us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.496us        24.94%      22.496us       1.875us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.72%      13.280us       1.107us            12  
                                            aten::clone         1.07%      27.110us        62.73%       1.592ms     265.408us       0.000us         0.00%       7.969us       1.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.817us         7.56%       6.817us       1.136us             6  
                                              aten::sub         1.66%      42.072us         2.63%      66.652us      11.109us       6.688us         7.41%       6.688us       1.115us             6  
                                              aten::add         1.28%      32.560us         2.18%      55.291us       9.215us       6.592us         7.31%       6.592us       1.099us             6  
                                Activity Buffer Request        56.87%       1.444ms        56.87%       1.444ms       1.444ms       1.152us         1.28%       1.152us       1.152us             1  
                                    aten::empty_strided         1.25%      31.671us         1.25%      31.671us       5.278us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.12%      53.780us         2.12%      53.780us       8.963us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.76%      70.023us         3.57%      90.653us       3.777us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.81%      20.630us         0.81%      20.630us       0.860us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.59%     218.025us         8.59%     218.025us       4.542us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.289us         0.21%       5.289us       5.289us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.539ms
Self CUDA time total: 90.209us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     950.812us      1013.41%     950.812us     950.812us             1  
                                            torch_eager        12.58%     319.124us        99.78%       2.531ms       2.531ms       0.000us         0.00%      95.135us      95.135us             1  
                                              aten::mul         6.09%     154.550us        10.34%     262.291us      10.929us      48.671us        51.88%      48.671us       2.028us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.671us        51.88%      48.671us       2.028us            24  
                                            aten::copy_         4.10%     104.029us        66.32%       1.682ms      93.470us      30.783us        32.81%      32.095us       1.783us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.943us        24.45%      22.943us       1.912us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.369us        15.32%      14.369us       1.197us            12  
                                            aten::clone         1.04%      26.300us        63.34%       1.607ms     267.803us       0.000us         0.00%       9.152us       1.525us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us         8.36%       7.840us       1.307us             6  
                                              aten::sub         1.64%      41.492us         2.64%      66.953us      11.159us       7.199us         7.67%       7.199us       1.200us             6  
                                              aten::add         1.26%      31.999us         2.14%      54.310us       9.052us       7.170us         7.64%       7.170us       1.195us             6  
                                Activity Buffer Request        57.64%       1.462ms        57.64%       1.462ms       1.462ms       1.312us         1.40%       1.312us       1.312us             1  
                                    aten::empty_strided         1.26%      31.840us         1.26%      31.840us       5.307us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.05%      52.102us         2.05%      52.102us       8.684us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.68%      67.986us         3.47%      87.958us       3.665us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.79%      19.972us         0.79%      19.972us       0.832us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.65%     219.475us         8.65%     219.475us       4.572us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.22%       5.651us         0.22%       5.651us       5.651us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.537ms
Self CUDA time total: 93.823us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     952.670us       942.15%     952.670us     952.670us             1  
                                            torch_eager        11.55%     312.506us        99.79%       2.701ms       2.701ms       0.000us         0.00%     102.429us     102.429us             1  
                                              aten::mul         5.68%     153.743us         9.71%     262.695us      10.946us      52.765us        52.18%      52.765us       2.199us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.765us        52.18%      52.765us       2.199us            24  
                                            aten::copy_         3.97%     107.471us        68.61%       1.857ms     103.165us      32.353us        32.00%      33.665us       1.870us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        24.37%      24.641us       2.053us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.999us        15.82%      15.999us       1.333us            12  
                                            aten::clone         1.01%      27.330us        65.76%       1.780ms     296.625us       0.000us         0.00%       9.024us       1.504us             6  
                                              aten::add         1.21%      32.850us         2.05%      55.600us       9.267us       8.032us         7.94%       8.032us       1.339us             6  
                                              aten::sub         1.44%      39.082us         2.35%      63.492us      10.582us       7.967us         7.88%       7.967us       1.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.63%       7.712us       1.285us             6  
                                Activity Buffer Request        52.99%       1.434ms        52.99%       1.434ms       1.434ms       1.312us         1.30%       1.312us       1.312us             1  
                                    aten::empty_strided         1.20%      32.420us         1.20%      32.420us       5.403us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.27%     250.924us         9.27%     250.924us      41.821us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.56%      69.212us         3.32%      89.782us       3.741us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.76%      20.570us         0.76%      20.570us       0.857us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.14%     220.374us         8.14%     220.374us       4.591us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.791us         0.21%       5.791us       5.791us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.707ms
Self CUDA time total: 101.117us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     987.399us      1051.70%     987.399us     987.399us             1  
                                            torch_eager        12.37%     335.778us        99.82%       2.710ms       2.710ms       0.000us         0.00%      95.198us      95.198us             1  
                                              aten::mul         5.74%     155.881us         9.81%     266.305us      11.096us      48.927us        52.11%      48.927us       2.039us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.927us        52.11%      48.927us       2.039us            24  
                                            aten::copy_         3.95%     107.229us        67.43%       1.830ms     101.693us      30.753us        32.76%      32.065us       1.781us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.881us        24.37%      22.881us       1.907us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.206us        15.13%      14.206us       1.184us            12  
                                            aten::clone         0.99%      26.953us        64.69%       1.756ms     292.683us       0.000us         0.00%       9.184us       1.531us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.38%       7.872us       1.312us             6  
                                              aten::add         1.25%      33.910us         2.11%      57.361us       9.560us       7.103us         7.57%       7.103us       1.184us             6  
                                              aten::sub         1.62%      44.010us         2.55%      69.231us      11.538us       7.103us         7.57%       7.103us       1.184us             6  
                                Activity Buffer Request        53.49%       1.452ms        53.49%       1.452ms       1.452ms       1.312us         1.40%       1.312us       1.312us             1  
                                    aten::empty_strided         1.24%      33.730us         1.24%      33.730us       5.622us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.66%     207.874us         7.66%     207.874us      34.646us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.54%      68.958us         3.31%      89.820us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.77%      20.862us         0.77%      20.862us       0.869us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.19%     222.327us         8.19%     222.327us       4.632us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       5.000us         0.18%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.715ms
Self CUDA time total: 93.886us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.771us       930.81%     939.771us     939.771us             1  
                                            torch_eager        11.42%     294.218us        99.78%       2.570ms       2.570ms       0.000us         0.00%     102.276us     102.276us             1  
                                              aten::mul         5.85%     150.653us        10.08%     259.594us      10.816us      52.609us        52.11%      52.609us       2.192us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.609us        52.11%      52.609us       2.192us            24  
                                            aten::copy_         4.01%     103.273us        68.02%       1.752ms      97.337us      32.450us        32.14%      33.763us       1.876us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        24.40%      24.640us       2.053us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.904us        15.75%      15.904us       1.325us            12  
                                            aten::clone         0.87%      22.360us        64.99%       1.674ms     278.983us       0.000us         0.00%       9.123us       1.520us             6  
                                              aten::sub         1.58%      40.669us         2.53%      65.240us      10.873us       7.968us         7.89%       7.968us       1.328us             6  
                                              aten::add         1.32%      33.930us         2.20%      56.580us       9.430us       7.936us         7.86%       7.936us       1.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.810us         7.74%       7.810us       1.302us             6  
                                Activity Buffer Request        54.28%       1.398ms        54.28%       1.398ms       1.398ms       1.313us         1.30%       1.313us       1.313us             1  
                                    aten::empty_strided         1.21%      31.291us         1.21%      31.291us       5.215us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.34%     188.943us         7.34%     188.943us      31.491us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.69%      69.330us         3.44%      88.671us       3.695us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.75%      19.341us         0.75%      19.341us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.46%     218.003us         8.46%     218.003us       4.542us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.22%       5.651us         0.22%       5.651us       5.651us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.576ms
Self CUDA time total: 100.963us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     987.019us       820.52%     987.019us     987.019us             1  
                                            torch_eager        11.12%     293.915us        99.79%       2.637ms       2.637ms       0.000us         0.00%     122.116us     122.116us             1  
                                              aten::mul         6.22%     164.251us        10.48%     276.937us      11.539us      61.922us        51.48%      61.922us       2.580us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.922us        51.48%      61.922us       2.580us            24  
                                            aten::copy_         3.96%     104.584us        67.08%       1.772ms      98.461us      39.265us        32.64%      41.089us       2.283us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.833us        23.97%      28.833us       2.403us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.105us        15.88%      19.105us       1.592us            12  
                                            aten::clone         0.81%      21.321us        64.15%       1.695ms     282.483us       0.000us         0.00%      12.256us       2.043us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.67%      10.432us       1.739us             6  
                                              aten::sub         1.58%      41.691us         2.56%      67.622us      11.270us       9.569us         7.95%       9.569us       1.595us             6  
                                              aten::add         1.31%      34.540us         2.17%      57.381us       9.563us       9.536us         7.93%       9.536us       1.589us             6  
                                Activity Buffer Request        53.87%       1.423ms        53.87%       1.423ms       1.423ms       1.824us         1.52%       1.824us       1.824us             1  
                                    aten::empty_strided         1.17%      30.940us         1.17%      30.940us       5.157us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.97%     184.193us         6.97%     184.193us      30.699us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.59%      94.920us         4.40%     116.150us       4.840us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.80%      21.230us         0.80%      21.230us       0.885us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.38%     221.517us         8.38%     221.517us       4.615us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.631us         0.21%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.642ms
Self CUDA time total: 120.292us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     942.977us       547.62%     942.977us     942.977us             1  
                                            torch_eager        11.98%     313.186us        99.77%       2.608ms       2.608ms       0.000us         0.00%     175.043us     175.043us             1  
                                              aten::mul         5.92%     154.664us        10.07%     263.135us      10.964us      89.731us        52.11%      89.731us       3.739us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.731us        52.11%      89.731us       3.739us            24  
                                            aten::copy_         4.21%     110.022us        67.75%       1.771ms      98.397us      57.632us        33.47%      60.480us       3.360us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.640us        23.60%      40.640us       3.387us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        14.42%      24.832us       2.069us            12  
                                            aten::clone         1.00%      26.050us        64.65%       1.690ms     281.685us       0.000us         0.00%      19.840us       3.307us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.87%      16.992us       2.832us             6  
                                              aten::add         1.22%      32.012us         2.08%      54.302us       9.050us      12.416us         7.21%      12.416us       2.069us             6  
                                              aten::sub         1.48%      38.721us         2.41%      62.881us      10.480us      12.416us         7.21%      12.416us       2.069us             6  
                                Activity Buffer Request        54.20%       1.417ms        54.20%       1.417ms       1.417ms       2.848us         1.65%       2.848us       2.848us             1  
                                    aten::empty_strided         1.15%      30.180us         1.15%      30.180us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.98%     182.574us         6.98%     182.574us      30.429us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.56%      66.979us         3.34%      87.351us       3.640us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.78%      20.372us         0.78%      20.372us       0.849us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.28%     216.491us         8.28%     216.491us       4.510us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.23%       5.900us         0.23%       5.900us       5.900us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.614ms
Self CUDA time total: 172.195us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.334us       791.88%     954.334us     954.334us             1  
                                            torch_eager        21.12%     286.823us        99.60%       1.352ms       1.352ms       0.000us         0.00%     122.339us     122.339us             1  
                                              aten::mul        11.39%     154.733us        19.43%     263.854us      10.994us      61.889us        51.35%      61.889us       2.579us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.889us        51.35%      61.889us       2.579us            24  
                                            aten::copy_         8.06%     109.392us        38.94%     528.759us      29.376us      39.393us        32.69%      41.217us       2.290us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.95%      28.864us       2.405us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.233us        15.96%      19.233us       1.603us            12  
                                            aten::clone         1.54%      20.901us        32.67%     443.638us      73.940us       0.000us         0.00%      12.353us       2.059us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.529us         8.74%      10.529us       1.755us             6  
                                              aten::sub         2.93%      39.731us         4.81%      65.293us      10.882us       9.633us         7.99%       9.633us       1.606us             6  
                                              aten::add         2.54%      34.552us         4.77%      64.792us      10.799us       9.600us         7.97%       9.600us       1.600us             6  
                                Activity Buffer Request        12.72%     172.763us        12.72%     172.763us     172.763us       1.824us         1.51%       1.824us       1.824us             1  
                                    aten::empty_strided         2.32%      31.561us         2.32%      31.561us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.45%     182.623us        13.45%     182.623us      30.437us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.17%      70.140us         6.66%      90.481us       3.770us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.50%      20.341us         1.50%      20.341us       0.848us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.86%     228.904us        16.86%     228.904us       4.769us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.40%       5.490us         0.40%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.358ms
Self CUDA time total: 120.515us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     961.439us       559.06%     961.439us     961.439us             1  
                                            torch_eager        21.39%     301.083us        99.65%       1.403ms       1.403ms       0.000us         0.00%     174.821us     174.821us             1  
                                              aten::mul        10.92%     153.723us        18.79%     264.437us      11.018us      89.541us        52.07%      89.541us       3.731us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.541us        52.07%      89.541us       3.731us            24  
                                            aten::copy_         8.57%     120.662us        41.11%     578.630us      32.146us      57.631us        33.51%      60.479us       3.360us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.639us        23.63%      40.639us       3.387us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.801us        14.42%      24.801us       2.067us            12  
                                            aten::clone         1.49%      21.022us        33.99%     478.490us      79.748us       0.000us         0.00%      19.840us       3.307us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.88%      16.992us       2.832us             6  
                                              aten::add         2.26%      31.841us         3.85%      54.131us       9.022us      12.481us         7.26%      12.481us       2.080us             6  
                                              aten::sub         2.79%      39.260us         4.52%      63.691us      10.615us      12.320us         7.16%      12.320us       2.053us             6  
                                Activity Buffer Request        15.02%     211.404us        15.02%     211.404us     211.404us       2.848us         1.66%       2.848us       2.848us             1  
                                    aten::empty_strided         2.10%      29.500us         2.10%      29.500us       4.917us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.01%     183.184us        13.01%     183.184us      30.531us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.96%      69.812us         6.41%      90.211us       3.759us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.45%      20.399us         1.45%      20.399us       0.850us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.69%     220.815us        15.69%     220.815us       4.600us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       4.890us         0.35%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.408ms
Self CUDA time total: 171.973us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     959.740us       338.81%     959.740us     959.740us             1  
                                            torch_eager        11.78%     309.495us        99.81%       2.622ms       2.622ms       0.000us         0.00%     301.248us     301.248us             1  
                                              aten::mul         5.80%     152.430us         9.98%     262.294us      10.929us     133.378us        47.09%     133.378us       5.557us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.378us        47.09%     133.378us       5.557us            24  
                                            aten::copy_         4.09%     107.511us        67.37%       1.770ms      98.338us     108.832us        38.42%     126.816us       7.045us            18  
                                            aten::clone         1.07%      28.041us        64.54%       1.696ms     282.603us       0.000us         0.00%      69.600us      11.600us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.216us        20.20%      57.216us       4.768us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.616us        18.22%      51.616us       8.603us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.054us        14.49%      41.054us       3.421us            12  
                                              aten::sub         1.57%      41.190us         2.52%      66.080us      11.013us      20.607us         7.27%      20.607us       3.434us             6  
                                              aten::add         1.56%      40.972us         2.46%      64.512us      10.752us      20.447us         7.22%      20.447us       3.408us             6  
                                Activity Buffer Request        53.79%       1.413ms        53.79%       1.413ms       1.413ms      17.984us         6.35%      17.984us      17.984us             1  
                                    aten::empty_strided         1.19%      31.311us         1.19%      31.311us       5.218us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.14%     187.713us         7.14%     187.713us      31.285us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.66%      69.760us         3.44%      90.282us       3.762us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.78%      20.522us         0.78%      20.522us       0.855us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.37%     219.936us         8.37%     219.936us       4.582us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.111us         0.19%       5.111us       5.111us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.627ms
Self CUDA time total: 283.264us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.299us       170.17%     964.299us     964.299us             1  
                                            torch_eager        21.37%     289.253us        99.58%       1.348ms       1.348ms       0.000us         0.00%     590.419us     590.419us             1  
                                            aten::copy_         7.69%     104.123us        37.93%     513.450us      28.525us     274.106us        48.37%     297.849us      16.547us            18  
                                              aten::mul        11.75%     159.118us        20.07%     271.705us      11.321us     226.427us        39.96%     226.427us       9.434us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     226.427us        39.96%     226.427us       9.434us            24  
                                            aten::clone         1.55%      21.020us        32.53%     440.358us      73.393us       0.000us         0.00%     206.843us      34.474us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.100us        32.31%     183.100us      30.517us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.006us        16.06%      91.006us       7.584us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.143us        11.67%      66.143us       5.512us            12  
                                              aten::sub         3.06%      41.432us         4.99%      67.562us      11.260us      33.664us         5.94%      33.664us       5.611us             6  
                                              aten::add         2.43%      32.930us         4.17%      56.451us       9.408us      32.479us         5.73%      32.479us       5.413us             6  
                                Activity Buffer Request        11.95%     161.793us        11.95%     161.793us     161.793us      23.743us         4.19%      23.743us      23.743us             1  
                                    aten::empty_strided         2.85%      38.611us         2.85%      38.611us       6.435us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.59%     183.934us        13.59%     183.934us      30.656us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.13%      69.460us         6.64%      89.941us       3.748us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.51%      20.481us         1.51%      20.481us       0.853us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.68%     225.838us        16.68%     225.838us       4.705us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.42%       5.710us         0.42%       5.710us       5.710us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.354ms
Self CUDA time total: 566.676us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     940.757us      1018.68%     940.757us     940.757us             1  
                                            torch_eager        20.92%     284.932us        99.61%       1.357ms       1.357ms       0.000us         0.00%      93.503us      93.503us             1  
                                              aten::mul        11.51%     156.743us        19.57%     266.566us      11.107us      49.664us        53.78%      49.664us       2.069us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.664us        53.78%      49.664us       2.069us            24  
                                            aten::copy_         7.76%     105.742us        39.84%     542.619us      30.146us      29.343us        31.77%      30.495us       1.694us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.528us        24.39%      22.528us       1.877us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.344us        14.45%      13.344us       1.112us            12  
                                            aten::clone         1.52%      20.734us        33.85%     461.099us      76.850us       0.000us         0.00%       7.967us       1.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.815us         7.38%       6.815us       1.136us             6  
                                              aten::sub         2.96%      40.252us         4.79%      65.263us      10.877us       6.688us         7.24%       6.688us       1.115us             6  
                                              aten::add         2.34%      31.811us         3.99%      54.311us       9.052us       6.656us         7.21%       6.656us       1.109us             6  
                                Activity Buffer Request        14.09%     191.853us        14.09%     191.853us     191.853us       1.152us         1.25%       1.152us       1.152us             1  
                                    aten::empty_strided         2.30%      31.379us         2.30%      31.379us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.46%     183.403us        13.46%     183.403us      30.567us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.20%      70.859us         6.67%      90.910us       3.788us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.47%      20.051us         1.47%      20.051us       0.835us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.08%     218.955us        16.08%     218.955us       4.562us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.39%       5.360us         0.39%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.362ms
Self CUDA time total: 92.351us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.706us       986.10%     945.706us     945.706us             1  
                                            torch_eager        12.18%     322.968us        99.79%       2.647ms       2.647ms       0.000us         0.00%      97.216us      97.216us             1  
                                              aten::mul         5.85%     155.091us         9.99%     264.924us      11.039us      50.947us        53.12%      50.947us       2.123us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      50.947us        53.12%      50.947us       2.123us            24  
                                            aten::copy_         3.92%     103.931us        67.30%       1.785ms      99.174us      30.783us        32.10%      32.095us       1.783us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.976us        23.96%      22.976us       1.915us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.174us        14.78%      14.174us       1.181us            12  
                                            aten::clone         1.18%      31.280us        64.70%       1.716ms     286.035us       0.000us         0.00%       9.119us       1.520us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         8.14%       7.807us       1.301us             6  
                                              aten::add         1.22%      32.380us         2.09%      55.311us       9.219us       7.102us         7.41%       7.102us       1.184us             6  
                                              aten::sub         1.50%      39.882us         2.41%      63.892us      10.649us       7.072us         7.37%       7.072us       1.179us             6  
                                Activity Buffer Request        53.95%       1.431ms        53.95%       1.431ms       1.431ms       1.312us         1.37%       1.312us       1.312us             1  
                                    aten::empty_strided         1.23%      32.600us         1.23%      32.600us       5.433us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.05%     187.002us         7.05%     187.002us      31.167us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.63%      69.642us         3.43%      90.901us       3.788us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.80%      21.259us         0.80%      21.259us       0.886us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.29%     220.006us         8.29%     220.006us       4.583us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.569us         0.21%       5.569us       5.569us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.653ms
Self CUDA time total: 95.904us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     963.956us       929.78%     963.956us     963.956us             1  
                                            torch_eager        11.95%     315.942us        99.78%       2.637ms       2.637ms       0.000us         0.00%     104.988us     104.988us             1  
                                              aten::mul         6.01%     158.721us        10.21%     269.951us      11.248us      55.295us        53.33%      55.295us       2.304us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.295us        53.33%      55.295us       2.304us            24  
                                            aten::copy_         4.03%     106.403us        67.45%       1.783ms      99.031us      32.417us        31.27%      33.729us       1.874us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.607us        23.73%      24.607us       2.051us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.964us        15.40%      15.964us       1.330us            12  
                                            aten::clone         1.02%      26.870us        64.62%       1.708ms     284.615us       0.000us         0.00%       9.122us       1.520us             6  
                                              aten::add         1.23%      32.629us         2.10%      55.390us       9.232us       7.997us         7.71%       7.997us       1.333us             6  
                                              aten::sub         1.44%      38.041us         2.36%      62.260us      10.377us       7.967us         7.68%       7.967us       1.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.810us         7.53%       7.810us       1.302us             6  
                                Activity Buffer Request        54.08%       1.429ms        54.08%       1.429ms       1.429ms       1.312us         1.27%       1.312us       1.312us             1  
                                    aten::empty_strided         1.27%      33.640us         1.27%      33.640us       5.607us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.95%     183.544us         6.95%     183.544us      30.591us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.64%      69.789us         3.42%      90.471us       3.770us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.78%      20.682us         0.78%      20.682us       0.862us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.39%     221.610us         8.39%     221.610us       4.617us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.22%       5.700us         0.22%       5.700us       5.700us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.643ms
Self CUDA time total: 103.676us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     933.942us       757.68%     933.942us     933.942us             1  
                                            torch_eager        21.17%     287.829us        99.59%       1.354ms       1.354ms       0.000us         0.00%     125.024us     125.024us             1  
                                              aten::mul        11.38%     154.770us        19.33%     262.774us      10.949us      64.862us        52.62%      64.862us       2.703us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      64.862us        52.62%      64.862us       2.703us            24  
                                            aten::copy_         7.76%     105.560us        40.17%     546.058us      30.337us      39.265us        31.85%      41.025us       2.279us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.865us        23.42%      28.865us       2.405us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.137us        15.53%      19.137us       1.595us            12  
                                            aten::clone         1.51%      20.520us        34.08%     463.317us      77.220us       0.000us         0.00%      12.160us       2.027us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us         8.44%      10.400us       1.733us             6  
                                              aten::sub         2.90%      39.471us         4.67%      63.511us      10.585us       9.569us         7.76%       9.569us       1.595us             6  
                                              aten::add         2.50%      34.030us         4.22%      57.431us       9.572us       9.568us         7.76%       9.568us       1.595us             6  
                                Activity Buffer Request        14.30%     194.363us        14.30%     194.363us     194.363us       1.760us         1.43%       1.760us       1.760us             1  
                                    aten::empty_strided         2.23%      30.321us         2.23%      30.321us       5.053us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.45%     182.914us        13.45%     182.914us      30.486us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.84%      65.748us         6.29%      85.480us       3.562us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.45%      19.732us         1.45%      19.732us       0.822us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.08%     218.666us        16.08%     218.666us       4.556us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.41%       5.560us         0.41%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.359ms
Self CUDA time total: 123.264us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.359us       900.66%     934.359us     934.359us             1  
                                            torch_eager        21.17%     286.322us        99.59%       1.347ms       1.347ms       0.000us         0.00%     105.086us     105.086us             1  
                                              aten::mul        11.62%     157.214us        19.66%     265.945us      11.081us      55.327us        53.33%      55.327us       2.305us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.327us        53.33%      55.327us       2.305us            24  
                                            aten::copy_         7.65%     103.495us        39.66%     536.482us      29.805us      32.511us        31.34%      33.855us       1.881us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.704us        23.81%      24.704us       2.059us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.904us        15.33%      15.904us       1.325us            12  
                                            aten::clone         1.57%      21.280us        33.91%     458.650us      76.442us       0.000us         0.00%       9.151us       1.525us             6  
                                              aten::add         2.43%      32.883us         4.09%      55.372us       9.229us       8.001us         7.71%       8.001us       1.333us             6  
                                              aten::sub         2.87%      38.810us         4.64%      62.781us      10.463us       7.903us         7.62%       7.903us       1.317us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         7.53%       7.807us       1.301us             6  
                                Activity Buffer Request        14.06%     190.184us        14.06%     190.184us     190.184us       1.344us         1.30%       1.344us       1.344us             1  
                                    aten::empty_strided         2.22%      30.070us         2.22%      30.070us       5.012us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.39%     181.103us        13.39%     181.103us      30.184us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.05%      68.302us         6.56%      88.771us       3.699us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.51%      20.469us         1.51%      20.469us       0.853us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.03%     216.891us        16.03%     216.891us       4.519us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.41%       5.591us         0.41%       5.591us       5.591us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.353ms
Self CUDA time total: 103.742us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.694us       764.03%     944.694us     944.694us             1  
                                            torch_eager        20.48%     287.824us        99.60%       1.400ms       1.400ms       0.000us         0.00%     125.438us     125.438us             1  
                                              aten::mul        10.91%     153.363us        18.83%     264.625us      11.026us      65.151us        52.69%      65.151us       2.715us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.151us        52.69%      65.151us       2.715us            24  
                                            aten::copy_         7.88%     110.793us        41.73%     586.532us      32.585us      39.328us        31.81%      41.120us       2.284us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.895us        23.37%      28.895us       2.408us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.167us        15.50%      19.167us       1.597us            12  
                                            aten::clone         1.52%      21.310us        35.87%     504.089us      84.015us       0.000us         0.00%      12.225us       2.038us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.433us         8.44%      10.433us       1.739us             6  
                                              aten::sub         2.80%      39.332us         4.57%      64.213us      10.702us       9.632us         7.79%       9.632us       1.605us             6  
                                              aten::add         2.33%      32.799us         3.97%      55.790us       9.298us       9.535us         7.71%       9.535us       1.589us             6  
                                Activity Buffer Request        15.08%     211.984us        15.08%     211.984us     211.984us       1.792us         1.45%       1.792us       1.792us             1  
                                    aten::empty_strided         2.18%      30.690us         2.18%      30.690us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        14.35%     201.734us        14.35%     201.734us      33.622us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.89%      68.724us         6.32%      88.851us       3.702us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.43%      20.127us         1.43%      20.127us       0.839us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.74%     221.155us        15.74%     221.155us       4.607us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.40%       5.570us         0.40%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.405ms
Self CUDA time total: 123.646us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.077us       529.63%     938.077us     938.077us             1  
                                            torch_eager        22.00%     288.844us        99.57%       1.307ms       1.307ms       0.000us         0.00%     179.967us     179.967us             1  
                                              aten::mul        11.92%     156.562us        20.13%     264.245us      11.010us      94.881us        53.57%      94.881us       3.953us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.881us        53.57%      94.881us       3.953us            24  
                                            aten::copy_         8.04%     105.524us        37.72%     495.290us      27.516us      57.663us        32.56%      60.511us       3.362us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.672us        22.96%      40.672us       3.389us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.575us        13.87%      24.575us       2.048us            12  
                                            aten::clone         1.60%      21.071us        31.51%     413.758us      68.960us       0.000us         0.00%      19.839us       3.306us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.991us         9.59%      16.991us       2.832us             6  
                                              aten::add         2.42%      31.800us         4.16%      54.561us       9.093us      12.288us         6.94%      12.288us       2.048us             6  
                                              aten::sub         3.05%      40.090us         5.01%      65.752us      10.959us      12.287us         6.94%      12.287us       2.048us             6  
                                Activity Buffer Request        10.75%     141.113us        10.75%     141.113us     141.113us       2.848us         1.61%       2.848us       2.848us             1  
                                    aten::empty_strided         2.28%      29.940us         2.28%      29.940us       4.990us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        14.21%     186.543us        14.21%     186.543us      31.091us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.18%      67.990us         6.68%      87.660us       3.652us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.50%      19.670us         1.50%      19.670us       0.820us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.62%     218.216us        16.62%     218.216us       4.546us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.43%       5.650us         0.43%       5.650us       5.650us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.313ms
Self CUDA time total: 177.119us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.076us       318.26%     945.076us     945.076us             1  
                                            torch_eager        21.55%     289.808us        99.58%       1.339ms       1.339ms       0.000us         0.00%     314.171us     314.171us             1  
                                              aten::mul        11.43%     153.633us        19.62%     263.817us      10.992us     145.952us        49.15%     145.952us       6.081us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.952us        49.15%     145.952us       6.081us            24  
                                            aten::copy_         9.11%     122.489us        38.99%     524.297us      29.128us     110.173us        37.10%     127.389us       7.077us            18  
                                            aten::clone         1.65%      22.169us        33.13%     445.468us      74.245us       0.000us         0.00%      70.110us      11.685us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.279us        19.29%      57.279us       4.773us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.894us        17.81%      52.894us       8.816us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.830us        13.75%      40.830us       3.402us            12  
                                              aten::sub         2.94%      39.549us         4.81%      64.690us      10.782us      20.511us         6.91%      20.511us       3.418us             6  
                                              aten::add         2.41%      32.411us         4.09%      55.020us       9.170us      20.319us         6.84%      20.319us       3.386us             6  
                                Activity Buffer Request        11.32%     152.193us        11.32%     152.193us     152.193us      17.216us         5.80%      17.216us      17.216us             1  
                                    aten::empty_strided         2.31%      31.082us         2.31%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.88%     186.593us        13.88%     186.593us      31.099us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.09%      68.450us         6.56%      88.160us       3.673us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.47%      19.710us         1.47%      19.710us       0.821us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.43%     220.956us        16.43%     220.956us       4.603us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.42%       5.661us         0.42%       5.661us       5.661us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.345ms
Self CUDA time total: 296.955us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     986.080us       556.73%     986.080us     986.080us             1  
                                            torch_eager        12.52%     336.567us        99.81%       2.683ms       2.683ms       0.000us         0.00%     179.999us     179.999us             1  
                                              aten::mul         5.82%     156.365us         9.99%     268.575us      11.191us      94.976us        53.62%      94.976us       3.957us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.976us        53.62%      94.976us       3.957us            24  
                                            aten::copy_         3.98%     106.939us        67.04%       1.802ms     100.094us      57.535us        32.48%      60.415us       3.356us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.703us        22.98%      40.703us       3.392us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.608us        13.89%      24.608us       2.051us            12  
                                            aten::clone         1.08%      29.091us        64.22%       1.726ms     287.668us       0.000us         0.00%      19.712us       3.285us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us         9.50%      16.832us       2.805us             6  
                                              aten::add         1.21%      32.499us         2.06%      55.240us       9.207us      12.320us         6.96%      12.320us       2.053us             6  
                                              aten::sub         1.59%      42.650us         2.57%      69.041us      11.507us      12.288us         6.94%      12.288us       2.048us             6  
                                Activity Buffer Request        53.52%       1.438ms        53.52%       1.438ms       1.438ms       2.880us         1.63%       2.880us       2.880us             1  
                                    aten::empty_strided         1.16%      31.221us         1.16%      31.221us       5.204us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.20%     193.473us         7.20%     193.473us      32.245us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.61%      70.195us         3.39%      91.232us       3.801us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.78%      21.037us         0.78%      21.037us       0.877us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.35%     224.324us         8.35%     224.324us       4.673us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       4.980us         0.19%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.688ms
Self CUDA time total: 177.119us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     955.007us       321.87%     955.007us     955.007us             1  
                                            torch_eager        21.61%     290.382us        99.58%       1.338ms       1.338ms       0.000us         0.00%     314.050us     314.050us             1  
                                              aten::mul        12.35%     165.965us        20.49%     275.388us      11.475us     146.274us        49.30%     146.274us       6.095us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.274us        49.30%     146.274us       6.095us            24  
                                            aten::copy_         7.99%     107.375us        38.18%     513.111us      28.506us     109.984us        37.07%     127.328us       7.074us            18  
                                            aten::clone         1.53%      20.570us        31.98%     429.868us      71.645us       0.000us         0.00%      70.048us      11.675us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.280us        19.31%      57.280us       4.773us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.704us        17.76%      52.704us       8.784us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.448us        13.63%      40.448us       3.371us            12  
                                              aten::sub         2.99%      40.150us         4.79%      64.400us      10.733us      20.288us         6.84%      20.288us       3.381us             6  
                                              aten::add         2.45%      32.907us         4.13%      55.499us       9.250us      20.160us         6.79%      20.160us       3.360us             6  
                                Activity Buffer Request        11.77%     158.223us        11.77%     158.223us     158.223us      17.344us         5.85%      17.344us      17.344us             1  
                                    aten::empty_strided         2.28%      30.711us         2.28%      30.711us       5.118us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.78%     185.224us        13.78%     185.224us      30.871us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.13%      68.942us         6.58%      88.372us       3.682us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.45%      19.430us         1.45%      19.430us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.26%     218.554us        16.26%     218.554us       4.553us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.42%       5.611us         0.42%       5.611us       5.611us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.344ms
Self CUDA time total: 296.706us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     962.939us       164.48%     962.939us     962.939us             1  
                                            torch_eager        21.30%     292.019us        99.59%       1.365ms       1.365ms       0.000us         0.00%     609.117us     609.117us             1  
                                            aten::copy_         7.59%     104.052us        39.10%     536.059us      29.781us     268.735us        45.90%     292.415us      16.245us            18  
                                              aten::mul        11.61%     159.130us        19.77%     271.083us      11.295us     251.454us        42.95%     251.454us      10.477us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.454us        42.95%     251.454us      10.477us            24  
                                            aten::clone         1.60%      21.919us        33.19%     455.067us      75.844us       0.000us         0.00%     201.504us      33.584us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.824us        30.37%     177.824us      29.637us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.911us        15.53%      90.911us       7.576us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.248us        11.15%      65.248us       5.437us            12  
                                              aten::sub         2.98%      40.869us         4.94%      67.700us      11.283us      32.703us         5.59%      32.703us       5.451us             6  
                                              aten::add         2.40%      32.850us         4.07%      55.841us       9.307us      32.545us         5.56%      32.545us       5.424us             6  
                                Activity Buffer Request        13.18%     180.724us        13.18%     180.724us     180.724us      23.680us         4.04%      23.680us      23.680us             1  
                                    aten::empty_strided         2.23%      30.541us         2.23%      30.541us       5.090us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.71%     188.023us        13.71%     188.023us      31.337us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         5.13%      70.322us         6.59%      90.292us       3.762us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.46%      19.970us         1.46%      19.970us       0.832us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.41%     225.035us        16.41%     225.035us       4.688us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.41%       5.640us         0.41%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.371ms
Self CUDA time total: 585.437us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         9.18%     318.848us        77.56%       2.693ms       2.693ms       0.000us         0.00%       1.840ms       1.840ms             1  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.811ms       102.06%       1.811ms       1.811ms             1  
                                            aten::copy_         3.19%     110.682us        53.02%       1.841ms     102.257us     792.737us        44.68%     858.369us      47.687us            18  
                                              aten::mul         4.39%     152.554us         7.57%     262.845us      10.952us     833.316us        46.97%     833.316us      34.721us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     833.316us        46.97%     833.316us      34.721us            24  
                                            aten::clone         0.79%      27.538us        50.82%       1.764ms     294.050us       0.000us         0.00%     624.865us     104.144us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     559.233us        31.52%     559.233us      93.206us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.504us        13.16%     233.504us      19.459us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     148.032us         8.34%     148.032us      12.336us            12  
                                              aten::sub         1.13%      39.132us         1.88%      65.111us      10.852us      90.112us         5.08%      90.112us      15.019us             6  
                                Activity Buffer Request        41.37%       1.436ms        41.37%       1.436ms       1.436ms      65.632us         3.70%      65.632us      65.632us             1  
                                              aten::add         0.97%      33.650us         1.61%      56.062us       9.344us      57.920us         3.26%      57.920us       9.653us             6  
                                    aten::empty_strided         0.92%      31.941us         0.92%      31.941us       5.324us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.62%     229.834us         6.62%     229.834us      38.306us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.00%      69.363us         2.59%      89.831us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.59%      20.468us         0.59%      20.468us       0.853us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         6.41%     222.613us         6.41%     222.613us       4.638us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize        22.44%     778.913us        22.44%     778.913us     778.913us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 3.472ms
Self CUDA time total: 1.774ms


impl                     wl                  p50(ms)  ok
torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S128_H32_D64_R32     0.23  True
torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
torch_eager              cuda_B2_S2048_H8_D128_R64     0.23  True
torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
torch_eager              cuda_B2_S512_H32_D64_R32     0.23  True
torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True

Artifacts:

rotary.jsonl