# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the layer norm kernel once, at import time, via `kernels.get_kernel`,
# so that `hf_kernels_layer_norm` below only calls into the already-loaded
# kernel module.
# NOTE(review): presumably resolved/downloaded from the Hugging Face Hub repo
# named here — confirm against the `kernels` documentation.
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
    """Apply layer normalization to a 3-D tensor using the loaded Hub kernel.

    The input is flattened to two dimensions, passed to the kernel's
    ``dropout_add_ln_fwd`` entry point, and the first element of the result
    is viewed back to the input's original 3-D shape.

    Args:
        x: Tensor whose shape unpacks into exactly three dimensions; the last
            one is the normalized (hidden) dimension. Must be compatible with
            ``Tensor.view`` (no copy is made here).
        weight: Passed to the kernel as ``gamma``.
        bias: Passed to the kernel as ``beta``.
        eps: Passed to the kernel as ``epsilon``.

    Returns:
        The first output of ``dropout_add_ln_fwd`` viewed with the same three
        dimensions as ``x``.

    Raises:
        ValueError: If ``x.shape`` does not unpack into three dimensions.
    """
    batch, seq_len, hidden = x.shape

    # The kernel expects [N, D] input, so collapse the two leading dimensions.
    flat_input = x.view(-1, hidden)

    # Dropout probability is zero, all scaling/subset arguments are None and
    # RMS-norm mode is off.
    # NOTE(review): presumably this reduces the fused op to a plain LayerNorm
    # forward pass — confirm against the kernel's documentation.
    kernel_outputs = layer_norm_kernel.dropout_add_ln_fwd(
        input=flat_input,
        gamma=weight,
        beta=bias,
        rowscale=None,
        colscale=None,
        x0_subset=None,
        z_subset=None,
        dropout_p=0.0,
        epsilon=eps,
        rowscale_const=1.0,
        z_numrows=seq_len,
        gen=None,
        residual_in_fp32=False,
        is_rms_norm=False,
    )

    # Only the first returned element is used; restore the caller's 3-D shape.
    return kernel_outputs[0].view(batch, seq_len, hidden)
# Hand the wrapper to the shared benchmark harness under the LAYER_NORM kernel
# type; the tags identify which implementation family/repo/op was measured.
run_benchmark(
    kernel_type=KernelTypeEnum.LAYER_NORM,
    impl_name="hf_kernels_layer_norm",
    impl_tags={
        "family": "hf-kernels",
        "repo": "kernels-community/layer-norm",
        "op": "layer_norm",
    },
    impl_func=hf_kernels_layer_norm,
)
Running layer_norm benchmark on cuda with 4 workloads.
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 4.99% 214.535us 50.40% 2.165ms 2.165ms 0.000us 0.00% 3.089ms 3.089ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.39% 59.840us 44.89% 1.928ms 642.793us 2.355ms 100.00% 3.089ms 1.030ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.357ms 100.06% 2.357ms 2.357ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.355ms 100.00% 2.355ms 785.131us 3
Activity Buffer Request 41.22% 1.771ms 41.22% 1.771ms 1.771ms 733.313us 31.13% 733.313us 733.313us 1
aten::view 0.51% 21.919us 0.51% 21.919us 3.653us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.06% 45.591us 1.06% 45.591us 5.066us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 9.340us 0.22% 9.340us 3.113us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.00% 42.910us 1.00% 42.910us 14.303us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 49.60% 2.131ms 49.60% 2.131ms 2.131ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.295ms
Self CUDA time total: 2.355ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 2.21% 146.665us 30.12% 2.003ms 2.003ms 0.000us 0.00% 6.394ms 6.394ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.64% 42.811us 27.74% 1.845ms 614.956us 4.819ms 100.00% 6.394ms 2.131ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.820ms 100.03% 4.820ms 4.820ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.819ms 100.00% 4.819ms 1.606ms 3
Activity Buffer Request 26.14% 1.739ms 26.14% 1.739ms 1.739ms 1.575ms 32.69% 1.575ms 1.575ms 1
aten::view 0.18% 11.889us 0.18% 11.889us 1.981us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.44% 29.319us 0.44% 29.319us 3.258us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.690us 0.07% 4.690us 1.563us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.44% 29.150us 0.44% 29.150us 9.717us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 69.88% 4.648ms 69.88% 4.648ms 4.648ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.651ms
Self CUDA time total: 4.819ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 2.00% 133.492us 30.10% 2.007ms 2.007ms 0.000us 0.00% 6.406ms 6.406ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.67% 44.942us 27.93% 1.863ms 620.970us 4.818ms 100.00% 6.406ms 2.135ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.819ms 100.03% 4.819ms 4.819ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.818ms 100.00% 4.818ms 1.606ms 3
Activity Buffer Request 26.34% 1.756ms 26.34% 1.756ms 1.756ms 1.588ms 32.97% 1.588ms 1.588ms 1
aten::view 0.16% 10.780us 0.16% 10.780us 1.797us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.44% 29.582us 0.44% 29.582us 3.287us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.759us 0.07% 4.759us 1.586us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.41% 27.190us 0.41% 27.190us 9.063us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 69.90% 4.662ms 69.90% 4.662ms 4.662ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.669ms
Self CUDA time total: 4.818ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.10% 128.730us 19.25% 2.252ms 2.252ms 0.000us 0.00% 12.776ms 12.776ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.142us 18.05% 2.112ms 704.015us 9.608ms 100.00% 12.776ms 4.259ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.609ms 100.01% 9.609ms 9.609ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.608ms 100.00% 9.608ms 3.203ms 3
Activity Buffer Request 15.07% 1.763ms 15.07% 1.763ms 1.763ms 3.168ms 32.98% 3.168ms 3.168ms 1
aten::view 0.10% 11.611us 0.10% 11.611us 1.935us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.25% 29.429us 0.25% 29.429us 3.270us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.891us 0.04% 4.891us 1.630us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.31% 270.775us 2.31% 270.775us 90.258us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 80.75% 9.448ms 80.75% 9.448ms 9.448ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 11.700ms
Self CUDA time total: 9.608ms
impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.12it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.24it/s]