# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Resolve the flash-attention kernel package once, at module import, so that
# the benchmarked wrapper below reuses the same loaded kernel module on every
# call instead of looking it up again.
_FLASH_ATTN_REPO_ID = "kernels-community/flash-attn"
hf_kernels_flash_attn = get_kernel(_FLASH_ATTN_REPO_ID)
def hf_flash_attention(query, key, value):
    """Run the HuggingFace Kernels flash-attention forward pass.

    Calls ``fwd`` on the module-level ``hf_kernels_flash_attn`` kernel with
    ``is_causal=False`` and returns only the first element of whatever
    ``fwd`` gives back.

    Args:
        query: Query input, forwarded unchanged to the kernel.
        key: Key input, forwarded unchanged to the kernel.
        value: Value input, forwarded unchanged to the kernel.

    Returns:
        Element ``0`` of the kernel's ``fwd`` result (presumably the attention
        output tensor -- confirm against the kernel's documentation).
    """
    # fwd returns an indexable result; only its first entry is benchmarked.
    fwd_outputs = hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)
    return fwd_outputs[0]
# Metadata tags handed to the benchmark harness alongside the implementation.
_IMPL_TAGS = {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}

# Benchmark the wrapper above as an attention-type kernel implementation.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="hf_kernels_flash_attn",
    impl_func=hf_flash_attention,
    impl_tags=_IMPL_TAGS,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 3.74% 162.312us 41.68% 1.808ms 1.808ms 0.000us 0.00% 3.686ms 3.686ms 1
_flash_attn_9e27194::fwd 1.67% 72.360us 37.94% 1.646ms 548.560us 2.753ms 100.00% 3.686ms 1.229ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.753ms 100.00% 2.753ms 917.639us 3
Activity Buffer Request 33.08% 1.435ms 33.08% 1.435ms 1.435ms 933.501us 33.91% 933.501us 933.501us 1
cudaDeviceGetAttribute 0.12% 5.209us 0.12% 5.209us 0.347us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.42% 18.210us 1.24% 53.790us 17.930us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.82% 35.580us 0.82% 35.580us 11.860us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.58% 25.153us 0.58% 25.153us 2.795us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.26% 11.441us 0.26% 11.441us 3.814us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.99% 42.781us 0.99% 42.781us 14.260us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.32% 2.530ms 58.32% 2.530ms 2.530ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.338ms
Self CUDA time total: 2.753ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.52% 113.464us 37.14% 1.670ms 1.670ms 0.000us 0.00% 3.984ms 3.984ms 1
_flash_attn_9e27194::fwd 1.10% 49.632us 34.61% 1.557ms 518.855us 2.977ms 100.00% 3.984ms 1.328ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.979ms 100.05% 2.979ms 2.979ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.977ms 100.00% 2.977ms 992.348us 3
Activity Buffer Request 31.69% 1.425ms 31.69% 1.425ms 1.425ms 1.007ms 33.82% 1.007ms 1.007ms 1
cudaDeviceGetAttribute 0.08% 3.769us 0.08% 3.769us 0.251us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.17% 7.560us 0.54% 24.080us 8.027us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.37% 16.520us 0.37% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.47% 21.170us 0.47% 21.170us 2.352us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.820us 0.08% 3.820us 1.273us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.64% 28.910us 0.64% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.86% 2.827ms 62.86% 2.827ms 2.827ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.497ms
Self CUDA time total: 2.977ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.39% 108.133us 36.58% 1.655ms 1.655ms 0.000us 0.00% 4.040ms 4.040ms 1
_flash_attn_9e27194::fwd 1.06% 48.029us 34.19% 1.547ms 515.608us 3.016ms 100.00% 4.040ms 1.347ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.017ms 100.05% 3.017ms 3.017ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.00% 3.016ms 1.005ms 3
Activity Buffer Request 31.28% 1.415ms 31.28% 1.415ms 1.415ms 1.024ms 33.96% 1.024ms 1.024ms 1
cudaDeviceGetAttribute 0.09% 4.281us 0.09% 4.281us 0.285us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.16% 7.121us 0.52% 23.411us 7.804us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.36% 16.290us 0.36% 16.290us 5.430us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.49% 22.080us 0.49% 22.080us 2.453us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.840us 0.08% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.66% 29.710us 0.66% 29.710us 9.903us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.42% 2.870ms 63.42% 2.870ms 2.870ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.525ms
Self CUDA time total: 3.016ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.32% 109.992us 39.04% 1.848ms 1.848ms 0.000us 0.00% 4.060ms 4.060ms 1
_flash_attn_9e27194::fwd 1.05% 49.564us 36.71% 1.738ms 579.317us 3.035ms 100.00% 4.060ms 1.353ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.037ms 100.05% 3.037ms 3.037ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 100.00% 3.035ms 1.012ms 3
Activity Buffer Request 29.72% 1.407ms 29.72% 1.407ms 1.407ms 1.025ms 33.76% 1.025ms 1.025ms 1
cudaDeviceGetAttribute 0.08% 3.690us 0.08% 3.690us 0.246us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.16% 7.770us 0.54% 25.380us 8.460us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.37% 17.610us 0.37% 17.610us 5.870us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.47% 22.139us 0.47% 22.139us 2.460us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.790us 0.08% 3.790us 1.263us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.78% 226.343us 4.78% 226.343us 75.448us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 60.96% 2.886ms 60.96% 2.886ms 2.886ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.734ms
Self CUDA time total: 3.035ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.11% 110.542us 35.45% 1.860ms 1.860ms 0.000us 0.00% 4.719ms 4.719ms 1
_flash_attn_9e27194::fwd 0.97% 51.080us 33.34% 1.750ms 583.220us 3.535ms 100.00% 4.719ms 1.573ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.535ms 100.00% 3.535ms 1.178ms 3
Activity Buffer Request 27.95% 1.467ms 27.95% 1.467ms 1.467ms 1.184ms 33.49% 1.184ms 1.184ms 1
cudaDeviceGetAttribute 0.07% 3.640us 0.07% 3.640us 0.243us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.14% 7.520us 0.47% 24.731us 8.244us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.33% 17.211us 0.33% 17.211us 5.737us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.43% 22.670us 0.43% 22.670us 2.519us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.800us 0.07% 3.800us 1.267us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.37% 176.824us 3.37% 176.824us 58.941us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 64.55% 3.388ms 64.55% 3.388ms 3.388ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.248ms
Self CUDA time total: 3.535ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.24% 118.861us 34.58% 1.832ms 1.832ms 0.000us 0.00% 4.834ms 4.834ms 1
_flash_attn_9e27194::fwd 0.90% 47.900us 32.34% 1.713ms 571.163us 3.618ms 100.00% 4.834ms 1.611ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.619ms 100.04% 3.619ms 3.619ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.618ms 100.00% 3.618ms 1.206ms 3
Activity Buffer Request 27.32% 1.448ms 27.32% 1.448ms 1.448ms 1.217ms 33.63% 1.217ms 1.217ms 1
cudaDeviceGetAttribute 0.07% 3.661us 0.07% 3.661us 0.244us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.14% 7.320us 0.50% 26.231us 8.744us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.36% 18.911us 0.36% 18.911us 6.304us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.40% 21.351us 0.40% 21.351us 2.372us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 4.160us 0.08% 4.160us 1.387us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.07% 162.463us 3.07% 162.463us 54.154us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 65.42% 3.466ms 65.42% 3.466ms 3.466ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.299ms
Self CUDA time total: 3.618ms
impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.94 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.03 True
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:02, 8.29it/s]
Fetching 20 files: 10%|█ | 2/20 [00:06<01:08, 3.82s/it]
Fetching 20 files: 100%|██████████| 20/20 [00:06<00:00, 3.06it/s]