# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Run attention through xFormers' memory-efficient attention entry point.

    The query, key and value tensors are forwarded unchanged; no bias,
    dropout, scale or explicit operator is supplied, so xFormers' defaults
    apply.

    Args:
        q: Query tensor.
        k: Key tensor.
        v: Value tensor.

    Returns:
        Whatever ``xops.memory_efficient_attention`` returns for the inputs.
    """
    # No transpose is done here: the inputs are handed over as-is, on the
    # expectation that they already use the [batch, seq_len, heads, head_dim]
    # layout xFormers works with.
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
# Benchmark the xFormers wrapper as an attention kernel; the tags label this
# implementation in the benchmark results.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_func=xformers_attention,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 11.46% 506.438us 53.66% 2.372ms 2.372ms 0.000us 0.00% 3.500ms 3.500ms 1
xformers_flash3::flash_fwd 4.48% 198.083us 41.44% 1.831ms 610.487us 0.000us 0.00% 3.500ms 1.167ms 3
flash_attn_3::fwd 1.73% 76.649us 36.96% 1.633ms 544.459us 2.610ms 100.00% 3.500ms 1.167ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.612ms 100.06% 2.612ms 2.612ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.610ms 100.00% 2.610ms 870.154us 3
Activity Buffer Request 33.26% 1.470ms 33.26% 1.470ms 1.470ms 889.248us 34.06% 889.248us 889.248us 1
aten::empty 0.80% 35.182us 0.80% 35.182us 5.864us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.25% 10.920us 0.25% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.92% 40.501us 0.92% 40.501us 13.500us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.27% 12.132us 0.77% 33.872us 5.645us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.49% 21.740us 0.49% 21.740us 3.623us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 46.34% 2.048ms 46.34% 2.048ms 2.048ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.420ms
Self CUDA time total: 2.610ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 7.25% 318.297us 46.47% 2.042ms 2.042ms 0.000us 0.00% 3.722ms 3.722ms 1
xformers_flash3::flash_fwd 3.37% 148.131us 38.68% 1.699ms 566.453us 0.000us 0.00% 3.722ms 1.241ms 3
flash_attn_3::fwd 1.17% 51.450us 35.31% 1.551ms 517.076us 2.780ms 100.00% 3.722ms 1.241ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.782ms 100.05% 2.782ms 2.782ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.780ms 100.00% 2.780ms 926.692us 3
Activity Buffer Request 32.58% 1.431ms 32.58% 1.431ms 1.431ms 942.244us 33.89% 942.244us 942.244us 1
aten::empty 0.66% 29.210us 0.66% 29.210us 4.868us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.13% 5.512us 0.13% 5.512us 1.837us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.77% 34.031us 0.77% 34.031us 11.344us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.21% 9.369us 0.54% 23.900us 3.983us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.33% 14.531us 0.33% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 53.53% 2.351ms 53.53% 2.351ms 2.351ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.393ms
Self CUDA time total: 2.780ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.91% 309.504us 45.24% 2.025ms 2.025ms 0.000us 0.00% 3.854ms 3.854ms 1
xformers_flash3::flash_fwd 3.30% 147.756us 37.80% 1.692ms 563.990us 0.000us 0.00% 3.854ms 1.285ms 3
flash_attn_3::fwd 1.19% 53.048us 34.50% 1.544ms 514.738us 2.875ms 100.00% 3.854ms 1.285ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.877ms 100.05% 2.877ms 2.877ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.875ms 100.00% 2.875ms 958.381us 3
Activity Buffer Request 31.77% 1.422ms 31.77% 1.422ms 1.422ms 979.266us 34.06% 979.266us 979.266us 1
aten::empty 0.67% 29.790us 0.67% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.570us 0.12% 5.570us 1.857us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.76% 33.852us 0.76% 33.852us 11.284us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.22% 9.920us 0.53% 23.660us 3.943us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.76% 2.451ms 54.76% 2.451ms 2.451ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.476ms
Self CUDA time total: 2.875ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.53% 306.895us 47.96% 2.255ms 2.255ms 0.000us 0.00% 3.838ms 3.838ms 1
xformers_flash3::flash_fwd 3.09% 145.243us 40.94% 1.925ms 641.651us 0.000us 0.00% 3.838ms 1.279ms 3
flash_attn_3::fwd 1.17% 55.062us 37.85% 1.780ms 593.237us 2.865ms 100.00% 3.838ms 1.279ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.866ms 100.05% 2.866ms 2.866ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.865ms 100.00% 2.865ms 954.931us 3
Activity Buffer Request 30.23% 1.421ms 30.23% 1.421ms 1.421ms 973.182us 33.97% 973.182us 973.182us 1
aten::empty 0.63% 29.790us 0.63% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.390us 0.11% 5.390us 1.797us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.70% 268.094us 5.70% 268.094us 89.365us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.19% 8.710us 0.49% 22.930us 3.822us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.30% 14.220us 0.30% 14.220us 2.370us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 52.04% 2.447ms 52.04% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.702ms
Self CUDA time total: 2.865ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.46% 328.735us 43.31% 2.206ms 2.206ms 0.000us 0.00% 4.477ms 4.477ms 1
xformers_flash3::flash_fwd 3.06% 155.642us 36.36% 1.852ms 617.231us 0.000us 0.00% 4.477ms 1.492ms 3
flash_attn_3::fwd 1.12% 56.881us 33.30% 1.696ms 565.350us 3.348ms 100.00% 4.477ms 1.492ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.349ms 100.04% 3.349ms 3.349ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.348ms 100.00% 3.348ms 1.116ms 3
Activity Buffer Request 27.91% 1.421ms 27.91% 1.421ms 1.421ms 1.129ms 33.72% 1.129ms 1.129ms 1
aten::empty 0.63% 32.251us 0.63% 32.251us 5.375us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.740us 0.11% 5.740us 1.913us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.53% 179.913us 3.53% 179.913us 59.971us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.21% 10.692us 0.50% 25.231us 4.205us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 14.539us 0.29% 14.539us 2.423us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 56.69% 2.887ms 56.69% 2.887ms 2.887ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.092ms
Self CUDA time total: 3.348ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.24% 320.533us 43.45% 2.233ms 2.233ms 0.000us 0.00% 4.496ms 4.496ms 1
xformers_flash3::flash_fwd 2.90% 149.124us 36.73% 1.887ms 629.094us 0.000us 0.00% 4.496ms 1.499ms 3
flash_attn_3::fwd 1.48% 76.290us 33.83% 1.738ms 579.386us 3.368ms 100.00% 4.496ms 1.499ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.369ms 100.05% 3.369ms 3.369ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.00% 3.368ms 1.123ms 3
Activity Buffer Request 28.33% 1.456ms 28.33% 1.456ms 1.456ms 1.129ms 33.51% 1.129ms 1.129ms 1
aten::empty 0.58% 29.962us 0.58% 29.962us 4.994us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 6.240us 0.12% 6.240us 2.080us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.31% 169.832us 3.31% 169.832us 56.611us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.21% 10.672us 0.48% 24.873us 4.146us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.28% 14.201us 0.28% 14.201us 2.367us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 56.55% 2.906ms 56.55% 2.906ms 2.906ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.138ms
Self CUDA time total: 3.368ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 0.98 True
xformers_meff cuda_attn_L256_bfloat16 1.02 True
xformers_meff cuda_attn_L320_bfloat16 1.07 True
xformers_meff cuda_attn_L384_bfloat16 1.08 True
xformers_meff cuda_attn_L448_bfloat16 1.24 True
xformers_meff cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs