# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the layer norm kernel once, at module import, so every call to
# `hf_kernels_layer_norm` reuses the same handle; that function invokes the
# kernel's `dropout_add_ln_fwd` entry point.
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
    """Run the ``kernels-community/layer-norm`` forward kernel on ``x``.

    The input is flattened to ``(B * S, D)`` rows, passed through
    ``layer_norm_kernel.dropout_add_ln_fwd`` with dropout disabled
    (``dropout_p=0.0``), no row/column scaling, no subsetting and
    ``is_rms_norm=False``, and the first element of the kernel's result is
    viewed back to ``(B, S, D)``.

    Args:
        x: Rank-3 tensor whose shape is unpacked as ``(B, S, D)``.
        weight: Forwarded to the kernel as ``gamma``.
        bias: Forwarded to the kernel as ``beta``.
        eps: Forwarded to the kernel as ``epsilon``.

    Returns:
        The first element of the kernel's result, shaped ``(B, S, D)``.

    Raises:
        ValueError: If ``x`` does not have exactly three dimensions (raised
            by the shape unpacking).
    """
    B, S, D = x.shape

    # The kernel expects [N, D] input. `reshape` is used instead of `view`:
    # when the strides allow it this is the same zero-copy view, but an input
    # whose strides cannot be flattened in place (e.g. a transposed or sliced
    # tensor) is copied instead of raising RuntimeError.
    x_2d = x.reshape(-1, D)

    results = layer_norm_kernel.dropout_add_ln_fwd(
        input=x_2d,
        gamma=weight,
        beta=bias,
        rowscale=None,
        colscale=None,
        x0_subset=None,
        z_subset=None,
        dropout_p=0.0,
        epsilon=eps,
        rowscale_const=1.0,
        # NOTE(review): S is passed here although the flattened input has
        # B * S rows; presumably ignored because z_subset is None -- confirm
        # against the kernel's implementation.
        z_numrows=S,
        gen=None,
        residual_in_fp32=False,
        is_rms_norm=False,
    )

    # Only the first returned element is used; restore the caller's 3-D shape.
    out = results[0].view(B, S, D)
    return out
# Run the layer-norm benchmark for this implementation; the tags identify the
# kernel family, source repository and operation for this implementation.
run_benchmark(
    kernel_type=KernelTypeEnum.LAYER_NORM,
    impl_name="hf_kernels_layer_norm",
    impl_tags={
        "family": "hf-kernels",
        "repo": "kernels-community/layer-norm",
        "op": "layer_norm",
    },
    impl_func=hf_kernels_layer_norm,
)
Running layer_norm benchmark on cuda with 4 workloads.
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 4.90% 197.042us 46.64% 1.877ms 1.877ms 0.000us 0.00% 3.132ms 3.132ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.74% 69.952us 41.15% 1.656ms 551.934us 2.385ms 100.00% 3.132ms 1.044ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.386ms 100.06% 2.386ms 2.386ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.385ms 100.00% 2.385ms 794.945us 3
Activity Buffer Request 37.01% 1.489ms 37.01% 1.489ms 1.489ms 747.170us 31.33% 747.170us 747.170us 1
aten::view 0.59% 23.780us 0.59% 23.780us 3.963us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.17% 47.212us 1.17% 47.212us 5.246us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.090us 0.23% 9.090us 3.030us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.00% 40.411us 1.00% 40.411us 13.470us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 53.36% 2.147ms 53.36% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.023ms
Self CUDA time total: 2.385ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 2.41% 154.482us 27.38% 1.753ms 1.753ms 0.000us 0.00% 6.413ms 6.413ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.72% 46.409us 24.77% 1.586ms 528.643us 4.824ms 100.00% 6.413ms 2.138ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.826ms 100.03% 4.826ms 4.826ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.824ms 100.00% 4.824ms 1.608ms 3
Activity Buffer Request 23.06% 1.476ms 23.06% 1.476ms 1.476ms 1.588ms 32.92% 1.588ms 1.588ms 1
aten::view 0.20% 12.531us 0.20% 12.531us 2.089us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.47% 30.283us 0.47% 30.283us 3.365us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.150us 0.08% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.43% 27.650us 0.43% 27.650us 9.217us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 72.62% 4.650ms 72.62% 4.650ms 4.650ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.403ms
Self CUDA time total: 4.824ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 2.19% 139.552us 27.69% 1.763ms 1.763ms 0.000us 0.00% 6.329ms 6.329ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.72% 45.651us 25.31% 1.612ms 537.326us 4.772ms 100.00% 6.329ms 2.110ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.03% 4.774ms 4.774ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.772ms 100.00% 4.772ms 1.591ms 3
Activity Buffer Request 23.61% 1.504ms 23.61% 1.504ms 1.504ms 1.557ms 32.63% 1.557ms 1.557ms 1
aten::view 0.19% 11.951us 0.19% 11.951us 1.992us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.48% 30.520us 0.48% 30.520us 3.391us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.131us 0.08% 5.131us 1.710us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.42% 26.970us 0.42% 26.970us 8.990us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 72.31% 4.606ms 72.31% 4.606ms 4.606ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.370ms
Self CUDA time total: 4.772ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.25% 143.461us 17.42% 1.995ms 1.995ms 0.000us 0.00% 12.814ms 12.814ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.40% 45.652us 16.06% 1.839ms 613.131us 9.628ms 100.00% 12.814ms 4.271ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.01% 9.629ms 9.629ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.00% 9.628ms 3.209ms 3
Activity Buffer Request 12.97% 1.486ms 12.97% 1.486ms 1.486ms 3.186ms 33.09% 3.186ms 3.186ms 1
aten::view 0.11% 12.411us 0.11% 12.411us 2.069us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.27% 31.101us 0.27% 31.101us 3.456us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 5.010us 0.04% 5.010us 1.670us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.37% 271.915us 2.37% 271.915us 90.638us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 82.58% 9.458ms 82.58% 9.458ms 9.458ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 11.453ms
Self CUDA time total: 9.628ms
impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.24it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.49it/s]