# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the flash attention 3 kernel
hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
def hf_flash_attention3(query, key, value):
return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
run_benchmark(
kernel_type=KernelTypeEnum.ATTENTION,
impl_name="hf_kernels_flash_attn3",
impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
impl_func=hf_flash_attention3,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 4.02% 170.054us 45.66% 1.931ms 1.931ms 0.000us 0.00% 3.489ms 3.489ms 1
FlashAttnFunc 2.98% 126.112us 41.64% 1.761ms 586.890us 0.000us 0.00% 3.489ms 1.163ms 3
_flash_attn3_48fe103_dirty::fwd 1.85% 78.440us 38.65% 1.635ms 544.853us 2.605ms 100.00% 3.489ms 1.163ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.606ms 100.06% 2.606ms 2.606ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.605ms 100.00% 2.605ms 868.221us 3
Activity Buffer Request 34.45% 1.457ms 34.45% 1.457ms 1.457ms 884.680us 33.97% 884.680us 884.680us 1
aten::empty 1.07% 45.402us 1.07% 45.402us 7.567us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.29% 12.202us 0.29% 12.202us 4.067us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.99% 41.761us 0.99% 41.761us 13.920us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 54.34% 2.298ms 54.34% 2.298ms 2.298ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.229ms
Self CUDA time total: 2.605ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.90% 125.133us 41.34% 1.782ms 1.782ms 0.000us 0.00% 3.684ms 3.684ms 1
FlashAttnFunc 2.10% 90.312us 38.43% 1.657ms 552.206us 0.000us 0.00% 3.684ms 1.228ms 3
_flash_attn3_48fe103_dirty::fwd 1.24% 53.461us 36.34% 1.566ms 522.102us 2.755ms 100.00% 3.684ms 1.228ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.06% 2.756ms 2.756ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.309us 3
Activity Buffer Request 33.60% 1.448ms 33.60% 1.448ms 1.448ms 929.157us 33.73% 929.157us 929.157us 1
aten::empty 0.64% 27.380us 0.64% 27.380us 4.563us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.13% 5.449us 0.13% 5.449us 1.816us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.74% 31.802us 0.74% 31.802us 10.601us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.66% 2.529ms 58.66% 2.529ms 2.529ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.310ms
Self CUDA time total: 2.755ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.81% 125.615us 39.44% 1.762ms 1.762ms 0.000us 0.00% 3.917ms 3.917ms 1
FlashAttnFunc 2.03% 90.880us 36.63% 1.637ms 545.546us 0.000us 0.00% 3.917ms 1.306ms 3
_flash_attn3_48fe103_dirty::fwd 1.20% 53.572us 34.59% 1.546ms 515.252us 2.927ms 100.00% 3.917ms 1.306ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.928ms 100.05% 2.928ms 2.928ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.927ms 100.00% 2.927ms 975.593us 3
Activity Buffer Request 31.96% 1.428ms 31.96% 1.428ms 1.428ms 990.441us 33.84% 990.441us 990.441us 1
aten::empty 0.63% 27.950us 0.63% 27.950us 4.658us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.340us 0.12% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.68% 30.562us 0.68% 30.562us 10.187us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 60.56% 2.706ms 60.56% 2.706ms 2.706ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.469ms
Self CUDA time total: 2.927ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.73% 126.513us 42.04% 1.948ms 1.948ms 0.000us 0.00% 3.892ms 3.892ms 1
FlashAttnFunc 2.03% 94.184us 39.31% 1.821ms 607.134us 0.000us 0.00% 3.892ms 1.297ms 3
_flash_attn3_48fe103_dirty::fwd 1.14% 52.959us 37.28% 1.727ms 575.740us 2.906ms 100.00% 3.892ms 1.297ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.908ms 100.05% 2.908ms 2.908ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.728us 3
Activity Buffer Request 30.69% 1.422ms 30.69% 1.422ms 1.422ms 985.540us 33.91% 985.540us 985.540us 1
aten::empty 0.63% 29.361us 0.63% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.241us 0.11% 5.241us 1.747us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.70% 217.965us 4.70% 217.965us 72.655us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 57.96% 2.685ms 57.96% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.633ms
Self CUDA time total: 2.906ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.33% 120.764us 37.09% 1.922ms 1.922ms 0.000us 0.00% 4.645ms 4.645ms 1
FlashAttnFunc 1.78% 92.240us 34.76% 1.801ms 600.384us 0.000us 0.00% 4.645ms 1.548ms 3
_flash_attn3_48fe103_dirty::fwd 1.04% 53.829us 32.98% 1.709ms 569.637us 3.482ms 100.00% 4.645ms 1.548ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.483ms 100.04% 3.483ms 3.483ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.482ms 100.00% 3.482ms 1.161ms 3
Activity Buffer Request 27.80% 1.441ms 27.80% 1.441ms 1.441ms 1.163ms 33.40% 1.163ms 1.163ms 1
aten::empty 0.54% 28.012us 0.54% 28.012us 4.669us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.211us 0.10% 5.211us 1.737us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.50% 181.305us 3.50% 181.305us 60.435us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.91% 3.260ms 62.91% 3.260ms 3.260ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.182ms
Self CUDA time total: 3.482ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.54% 130.883us 37.28% 1.924ms 1.924ms 0.000us 0.00% 4.633ms 4.633ms 1
FlashAttnFunc 1.80% 93.033us 34.74% 1.793ms 597.564us 0.000us 0.00% 4.633ms 1.544ms 3
_flash_attn3_48fe103_dirty::fwd 1.02% 52.583us 32.94% 1.700ms 566.553us 3.468ms 100.00% 4.633ms 1.544ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.04% 3.469ms 3.469ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.468ms 100.00% 3.468ms 1.156ms 3
Activity Buffer Request 27.99% 1.444ms 27.99% 1.444ms 1.444ms 1.165ms 33.61% 1.165ms 1.165ms 1
aten::empty 0.56% 29.150us 0.56% 29.150us 4.858us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.050us 0.10% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.27% 168.763us 3.27% 168.763us 56.254us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.72% 3.236ms 62.72% 3.236ms 3.236ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.160ms
Self CUDA time total: 3.468ms
impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.95 True
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.35it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.71it/s]