From 0632239fb6a1e3a04f483f90dbea5ea4a7de7a95 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 22 Jun 2026 17:37:19 +0200 Subject: [PATCH 1/5] Add `alloca` intrinsic for per-workitem stack scratch Introduce `GPUCompiler.alloca(::Type{T}, ::Val{N})::Ptr{T}`, which hands device code a fixed-size, per-workitem stack scratch buffer for `N` elements of `T`. This is meant to replace abstractions like KernelAbstractions' `@private` `MArray`-backed scratchpad with a direct stack allocation. Emitting the `alloca` through `llvmcall` directly is unsound/ineffective: the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target stack address space (e.g. addrspace 5 on NVPTX/AMDGPU) isn't known at the front end, and the LangRef lifetime of the `alloca` is tied to the inlined `llvmcall` wrapper. Instead, the front end emits a `julia.gpu.alloca.<bytes>.<align>` intrinsic that `lower_alloca!` (run from `irgen`, before the optimizer) materializes as a real entry-block `alloca` in the datalayout's alloca address space, cast back to generic. Running before optimization lets the slot be promoted just like the mutable stack allocations Julia already emits. `T` must be `isbits`. Co-Authored-By: Claude Opus 4.8 --- src/irgen.jl | 123 +++++++++++++++++++++++++++++++++++++++++++++++++ test/native.jl | 41 +++++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/src/irgen.jl b/src/irgen.jl index 80dac84d..c8979e0d 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -149,6 +149,10 @@ function irgen(@nospecialize(job::CompilerJob)) # the job's configured level, so device code can branch on it as a compile-time # constant that is part of the cache key (unlike reading the `-g` global directly). lower_debug_level!(job, mod) + + # materialize `GPUCompiler.alloca` intrinsics as real entry-block allocas, before the + # optimizer runs so the slots can be promoted (see `lower_alloca!`). 
+ lower_alloca!(job, mod) end return mod, compiled, gv_to_value @@ -1216,6 +1220,125 @@ function lower_debug_level!(@nospecialize(job::CompilerJob), mod::LLVM.Module) return true end + +## stack allocation + +# device code can request a fixed-size, per-workitem stack scratch buffer via +# `alloca(T, Val(N))`, returning a `Ptr{T}` to uninitialized storage for `N` elements of +# `T`. this emits the `julia.gpu.alloca..` intrinsic, which `lower_alloca!` +# (run from `irgen`, before the optimizer) materializes as a real entry-block `alloca`. +# +# this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective: +# the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target +# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front end, and the +# LangRef lifetime of an `alloca` is tied to the (inlined) `llvmcall` wrapper. lowering it +# ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca +# address space, early enough for the optimizer to promote it. + +function alloca_intr(mod::LLVM.Module, bytes::Integer, align::Integer) + name = "julia.gpu.alloca.$(bytes).$(align)" + intr = if haskey(functions(mod), name) + functions(mod)[name] + else + # returns an opaque pointer; intentionally *not* readnone/speculatable, as each call + # must yield a distinct slot and must not be hoisted or CSE'd. + LLVM.Function(mod, name, LLVM.FunctionType(LLVM.PointerType())) + end + return intr +end + +# run-time equivalent: emits a call to the alloca intrinsic, returning a `Ptr{T}` to scratch +# storage for `N` elements of `T` (materialized by `lower_alloca!`). 
+function alloca_value(@nospecialize(T), N::Int) + isbitstype(T) || + error("GPUCompiler.alloca only supports `isbits` element types, got $T") + N >= 0 || throw(ArgumentError("GPUCompiler.alloca count must be non-negative, got $N")) + + bytes = sizeof(T) * N + align = Base.datatype_alignment(T) + + # a zero-byte allocation has no storage to point at; hand back a null pointer rather than + # emitting a degenerate 0-element alloca. + if bytes == 0 + return :(reinterpret(Ptr{$T}, C_NULL)) + end + + @dispose ctx=Context() begin + T_ptr = LLVM.PointerType() + + # create function + llvm_f, _ = create_function(T_ptr) + mod = LLVM.parent(llvm_f) + + # get intrinsic + intr = alloca_intr(mod, bytes, align) + intr_ft = function_type(intr) + + # generate IR + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") + position!(builder, entry) + + ptr = call!(builder, intr_ft, intr, Value[], "alloca") + + ret!(builder, ptr) + end + + call_function(llvm_f, Ptr{T}) + end +end + +# device-facing accessor: a `Ptr{T}` to per-workitem stack scratch for `N` elements of `T`. +# the storage is uninitialized and only valid within the calling kernel. `T` must be `isbits` +# (an `alloca` of GC-tracked references would be unrooted). intended as a building block for +# higher-level scratch abstractions (e.g. KernelAbstractions' `@private`). +@inline @generated alloca(::Type{T}, ::Val{N}) where {T,N} = alloca_value(T, N) +export alloca + +# replace every `julia.gpu.alloca.*` call with an entry-block alloca in the containing function +function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module) + changed = false + prefix = "julia.gpu.alloca." 
+ + for intr in collect(functions(mod)) + fn = LLVM.name(intr) + startswith(fn, prefix) || continue + + bytes, align = parse.(Int, split(fn[length(prefix)+1:end], '.')) + slot_typ = LLVM.ArrayType(LLVM.Int8Type(), bytes) + + for use in collect(uses(intr)) + call = user(use) + @assert call isa LLVM.CallInst + f = LLVM.parent(LLVM.parent(call)) + + @dispose builder=IRBuilder() begin + # materialize the slot at the top of the entry block so that it is a static + # alloca (promotable, and allocated once rather than per loop iteration). + position!(builder, first(instructions(first(blocks(f))))) + slot = alloca!(builder, slot_typ, "alloca") + alignment!(slot, align) + + # `alloca!` placed the slot in the datalayout's alloca address space; cast back + # to generic (AS 0) to match the `Ptr` the front end handed out. + ptr = if LLVM.addrspace(value_type(slot)) == 0 + slot + else + addrspacecast!(builder, slot, LLVM.PointerType()) + end + replace_uses!(call, ptr) + end + erase!(call) + changed = true + end + + @assert isempty(uses(intr)) + erase!(intr) + end + + return changed +end + # convert kernel state argument from pass-by-value to pass-by-reference # # the kernel state argument is always passed by value to avoid codegen issues with byval. 
diff --git a/test/native.jl b/test/native.jl index 95980c8e..613780de 100644 --- a/test/native.jl +++ b/test/native.jl @@ -739,3 +739,44 @@ end @test !occursin("deferred_codegen", ir) @test occursin("call void @julia_kernel", ir) end + +@testset "stack allocation intrinsic" begin + mod = @eval module $(gensym()) + import ..GPUCompiler + + function scratch(x) + p = GPUCompiler.alloca(Float32, Val(8)) + @inbounds unsafe_store!(p, x, 1) + @inbounds unsafe_store!(p, x, 8) + return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) + end + + # zero-element scratch yields a (null) pointer without emitting an alloca + empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) + end + + # the intrinsic is materialized as a single entry-block `alloca [8 x f32 = 32 x i8]`, + # and no `julia.gpu.alloca` call/declaration survives lowering. + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check "alloca [32 x i8], align 4" + @check_not "julia.gpu.alloca" + Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + end + + # once optimized the slot is promoted away entirely (result is x + x). + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + Native.code_llvm(mod.scratch, Tuple{Float32}) + end + + # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. 
+ @test @filecheck begin + @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + Native.code_llvm(mod.empty_scratch, Tuple{}) + end +end From e13a8c5b0c49e23769ea2aae308cfac1db6b7ddc Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 23 Jun 2026 09:21:43 +0200 Subject: [PATCH 2/5] cleanup --- src/irgen.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/irgen.jl b/src/irgen.jl index c8979e0d..78864221 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -1230,7 +1230,7 @@ end # # this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective: # the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target -# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front end, and the +# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front-end, and the # LangRef lifetime of an `alloca` is tied to the (inlined) `llvmcall` wrapper. lowering it # ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca # address space, early enough for the optimizer to promote it. @@ -1300,7 +1300,7 @@ function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module) changed = false prefix = "julia.gpu.alloca." - for intr in collect(functions(mod)) + for intr in functions(mod) fn = LLVM.name(intr) startswith(fn, prefix) || continue @@ -1320,7 +1320,7 @@ function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module) alignment!(slot, align) # `alloca!` placed the slot in the datalayout's alloca address space; cast back - # to generic (AS 0) to match the `Ptr` the front end handed out. + # to generic (AS 0) to match the `Ptr` `alloca` returns. 
 ptr = if LLVM.addrspace(value_type(slot)) == 0 slot else From 8563d150efd97bb4a8d121adc62c5bc7bbffb9df Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 30 Jun 2026 13:21:03 +0200 Subject: [PATCH 3/5] Add stack allocation intrinsic tests for PTX, GCN, SPIRV, and Metal Test that the `alloca` intrinsic lowers correctly for all GPU targets --- test/gcn.jl | 41 +++++++++++++++++++++++++++++++++++++++++ test/metal.jl | 41 +++++++++++++++++++++++++++++++++++++++++ test/ptx.jl | 41 +++++++++++++++++++++++++++++++++++++++++ test/spirv.jl | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+) diff --git a/test/gcn.jl b/test/gcn.jl index 99dae742..1ec598fe 100644 --- a/test/gcn.jl +++ b/test/gcn.jl @@ -476,5 +476,46 @@ end GCN.code_native(devnull, mod.kernel, Tuple{Float32,Ptr{Float32}}) end +@testset "stack allocation intrinsic" begin + mod = @eval module $(gensym()) + import ..GPUCompiler + + function scratch(x) + p = GPUCompiler.alloca(Float32, Val(8)) + @inbounds unsafe_store!(p, x, 1) + @inbounds unsafe_store!(p, x, 8) + return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) + end + + # zero-element scratch yields a (null) pointer without emitting an alloca + empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) + end + + # AMDGPU uses alloca address space 5, so the materialized slot lives in AS 5 and + # `lower_alloca!` emits an `addrspacecast` back to generic (AS 0). + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check "alloca [32 x i8], align 4, addrspace(5)" + @check "addrspacecast" + @check_not "julia.gpu.alloca" + GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + end + + # once optimized the slot is promoted away entirely (result is x + x). 
+ @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check_not "julia.gpu.alloca" + GCN.code_llvm(mod.scratch, Tuple{Float32}) + end + + # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. + @test @filecheck begin + @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + GCN.code_llvm(mod.empty_scratch, Tuple{}) + end +end + end end # :AMDGPU in LLVM.backends() diff --git a/test/metal.jl b/test/metal.jl index c1f8b369..02b0b818 100644 --- a/test/metal.jl +++ b/test/metal.jl @@ -1270,4 +1270,45 @@ end end end +@testset "stack allocation intrinsic" begin + mod = @eval module $(gensym()) + import ..GPUCompiler + + function scratch(x) + p = GPUCompiler.alloca(Float32, Val(8)) + @inbounds unsafe_store!(p, x, 1) + @inbounds unsafe_store!(p, x, 8) + return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) + end + + # zero-element scratch yields a (null) pointer without emitting an alloca + empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) + end + + # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`, + # and no `julia.gpu.alloca` call/declaration survives lowering. + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check "alloca [32 x i8], align 4" + @check_not "julia.gpu.alloca" + Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + end + + # once optimized the slot is promoted away entirely (result is x + x). + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + Metal.code_llvm(mod.scratch, Tuple{Float32}) + end + + # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. 
+ @test @filecheck begin + @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + Metal.code_llvm(mod.empty_scratch, Tuple{}) + end +end + end diff --git a/test/ptx.jl b/test/ptx.jl index 2d93f3d5..3cd35bbc 100644 --- a/test/ptx.jl +++ b/test/ptx.jl @@ -137,6 +137,47 @@ end @test occursin("call void @julia_", ir) end +@testset "stack allocation intrinsic" begin + mod = @eval module $(gensym()) + import ..GPUCompiler + + function scratch(x) + p = GPUCompiler.alloca(Float32, Val(8)) + @inbounds unsafe_store!(p, x, 1) + @inbounds unsafe_store!(p, x, 8) + return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) + end + + # zero-element scratch yields a (null) pointer without emitting an alloca + empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) + end + + # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`, + # and no `julia.gpu.alloca` call/declaration survives lowering. + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check "alloca [32 x i8], align 4" + @check_not "julia.gpu.alloca" + PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + end + + # once optimized the slot is promoted away entirely (result is x + x). + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + PTX.code_llvm(mod.scratch, Tuple{Float32}) + end + + # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. 
+ @test @filecheck begin + @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + PTX.code_llvm(mod.empty_scratch, Tuple{}) + end +end + end ############################################################################################ diff --git a/test/spirv.jl b/test/spirv.jl index b5006194..9b9bd247 100644 --- a/test/spirv.jl +++ b/test/spirv.jl @@ -203,4 +203,45 @@ end end end +@testset "stack allocation intrinsic" begin + mod = @eval module $(gensym()) + import ..GPUCompiler + + function scratch(x) + p = GPUCompiler.alloca(Float32, Val(8)) + @inbounds unsafe_store!(p, x, 1) + @inbounds unsafe_store!(p, x, 8) + return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) + end + + # zero-element scratch yields a (null) pointer without emitting an alloca + empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) + end + + # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`, + # and no `julia.gpu.alloca` call/declaration survives lowering. + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check "alloca [32 x i8], align 4" + @check_not "julia.gpu.alloca" + SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false) + end + + # once optimized the slot is promoted away entirely (result is x + x). + @test @filecheck begin + @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend) + end + + # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. 
+ @test @filecheck begin + @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" + @check_not "alloca" + @check_not "julia.gpu.alloca" + SPIRV.code_llvm(mod.empty_scratch, Tuple{}; backend) + end +end + end From 699d04c2d7f7c041781c3df9cf68cd3f72edf134 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 30 Jun 2026 13:57:06 +0200 Subject: [PATCH 4/5] Fix alloca intrinsic under typed pointers and older Julia versions 1. On Julia 1.10/1.11, LLVM runs in typed-pointers mode. PointerType() creates an opaque pointer in LLVM 15, causing assembly parsing errors on llvmcall since opaque pointers are disabled by default. Fix this by using PointerType(Int8Type()) when supports_typed_pointers() is true. 2. In lower_alloca!, if the pointer types or address spaces differ (which is common under typed pointers where the slot is [32 x i8]* but the call returns i8*), dynamically handle casting using bitcast! or addrspacecast! as appropriate. 3. On Julia 1.10/1.11, under optimize=false, the compiler compiles the llvmcall into a non-inlined helper function. Fix test checking by passing dump_module=true to code_llvm in the unoptimized checks to allow matching inside helper functions. 4. Dev the package in test/Project.toml so that Pkg loads the local repository package under Julia 1.10. --- src/irgen.jl | 13 +++++++++---- test/Project.toml | 3 +++ test/gcn.jl | 2 +- test/metal.jl | 2 +- test/native.jl | 2 +- test/ptx.jl | 2 +- test/spirv.jl | 2 +- 7 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/irgen.jl b/src/irgen.jl index 78864221..c391fbdc 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -1242,7 +1242,8 @@ function alloca_intr(mod::LLVM.Module, bytes::Integer, align::Integer) else # returns an opaque pointer; intentionally *not* readnone/speculatable, as each call # must yield a distinct slot and must not be hoisted or CSE'd. 
- LLVM.Function(mod, name, LLVM.FunctionType(LLVM.PointerType())) + T_ptr = LLVM.supports_typed_pointers(LLVM.context(mod)) ? LLVM.PointerType(LLVM.Int8Type()) : LLVM.PointerType() + LLVM.Function(mod, name, LLVM.FunctionType(T_ptr)) end return intr end @@ -1264,7 +1265,7 @@ function alloca_value(@nospecialize(T), N::Int) end @dispose ctx=Context() begin - T_ptr = LLVM.PointerType() + T_ptr = LLVM.supports_typed_pointers(ctx) ? LLVM.PointerType(LLVM.Int8Type()) : LLVM.PointerType() # create function llvm_f, _ = create_function(T_ptr) @@ -1321,10 +1322,14 @@ function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module) # `alloca!` placed the slot in the datalayout's alloca address space; cast back # to generic (AS 0) to match the `Ptr` `alloca` returns. - ptr = if LLVM.addrspace(value_type(slot)) == 0 + ptr = if value_type(slot) == value_type(call) slot else - addrspacecast!(builder, slot, LLVM.PointerType()) + if LLVM.addrspace(value_type(slot)) == LLVM.addrspace(value_type(call)) + bitcast!(builder, slot, value_type(call)) + else + addrspacecast!(builder, slot, value_type(call)) + end end replace_uses!(call, ptr) end diff --git a/test/Project.toml b/test/Project.toml index d01c6e6e..330867c5 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -23,3 +23,6 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673" Aqua = "0.8" LLVM_jll = "15,16,18,20" ParallelTestRunner = "2" + +[sources] +GPUCompiler = { path=".." } diff --git a/test/gcn.jl b/test/gcn.jl index 1ec598fe..5e35d580 100644 --- a/test/gcn.jl +++ b/test/gcn.jl @@ -498,7 +498,7 @@ end @check "alloca [32 x i8], align 4, addrspace(5)" @check "addrspacecast" @check_not "julia.gpu.alloca" - GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true) end # once optimized the slot is promoted away entirely (result is x + x). 
diff --git a/test/metal.jl b/test/metal.jl index 02b0b818..47398ec4 100644 --- a/test/metal.jl +++ b/test/metal.jl @@ -1291,7 +1291,7 @@ end @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" @check "alloca [32 x i8], align 4" @check_not "julia.gpu.alloca" - Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true) end # once optimized the slot is promoted away entirely (result is x + x). diff --git a/test/native.jl b/test/native.jl index 613780de..15010864 100644 --- a/test/native.jl +++ b/test/native.jl @@ -761,7 +761,7 @@ end @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" @check "alloca [32 x i8], align 4" @check_not "julia.gpu.alloca" - Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true) end # once optimized the slot is promoted away entirely (result is x + x). diff --git a/test/ptx.jl b/test/ptx.jl index 3cd35bbc..84f8b753 100644 --- a/test/ptx.jl +++ b/test/ptx.jl @@ -158,7 +158,7 @@ end @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" @check "alloca [32 x i8], align 4" @check_not "julia.gpu.alloca" - PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false) + PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true) end # once optimized the slot is promoted away entirely (result is x + x). diff --git a/test/spirv.jl b/test/spirv.jl index 9b9bd247..7c01a118 100644 --- a/test/spirv.jl +++ b/test/spirv.jl @@ -224,7 +224,7 @@ end @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" @check "alloca [32 x i8], align 4" @check_not "julia.gpu.alloca" - SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false) + SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false, dump_module=true) end # once optimized the slot is promoted away entirely (result is x + x). 
From fdc476561fc5600d1e7612f1cf9f69f997c09c6b Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 4 Jul 2026 23:48:35 +0200 Subject: [PATCH 5/5] Simplify the alloca intrinsic - Encode size and alignment as constant call operands of a single `julia.gpu.alloca` declaration (matching `deferred_codegen`), instead of mangling them into the intrinsic name; `lower_alloca!` now guards with an O(1) name lookup like `lower_debug_level!`. - Replace the hand-rolled bitcast/addrspacecast selection with `pointercast!`, and hoist the IRBuilder out of the per-call loop. - Test the lowering on native (generic path) and GCN (nonzero alloca address space) only, dropping the identical PTX/SPIRV/Metal copies. - Don't export `alloca`; downstream users call it qualified. Co-Authored-By: Claude Fable 5 --- src/irgen.jl | 80 ++++++++++++++++++++++----------------------------- test/metal.jl | 41 -------------------------- test/ptx.jl | 41 -------------------------- test/spirv.jl | 41 -------------------------- 4 files changed, 34 insertions(+), 169 deletions(-) diff --git a/src/irgen.jl b/src/irgen.jl index c391fbdc..169123a6 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -1225,8 +1225,9 @@ end # device code can request a fixed-size, per-workitem stack scratch buffer via # `alloca(T, Val(N))`, returning a `Ptr{T}` to uninitialized storage for `N` elements of -# `T`. this emits the `julia.gpu.alloca..` intrinsic, which `lower_alloca!` -# (run from `irgen`, before the optimizer) materializes as a real entry-block `alloca`. +# `T`. this emits a call to the `julia.gpu.alloca` intrinsic with the size and alignment +# as constant operands, which `lower_alloca!` (run from `irgen`, before the optimizer) +# materializes as a real entry-block `alloca`. 
# # this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective: # the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target @@ -1235,15 +1236,16 @@ end # ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca # address space, early enough for the optimizer to promote it. -function alloca_intr(mod::LLVM.Module, bytes::Integer, align::Integer) - name = "julia.gpu.alloca.$(bytes).$(align)" +function alloca_intr(mod::LLVM.Module, T_ptr::LLVMType) + name = "julia.gpu.alloca" intr = if haskey(functions(mod), name) functions(mod)[name] else - # returns an opaque pointer; intentionally *not* readnone/speculatable, as each call - # must yield a distinct slot and must not be hoisted or CSE'd. - T_ptr = LLVM.supports_typed_pointers(LLVM.context(mod)) ? LLVM.PointerType(LLVM.Int8Type()) : LLVM.PointerType() - LLVM.Function(mod, name, LLVM.FunctionType(T_ptr)) + # takes the size in bytes and the alignment as constant operands, and returns an + # opaque pointer; intentionally *not* readnone/speculatable, as each call must + # yield a distinct slot and must not be hoisted or CSE'd. + T_i64 = LLVM.Int64Type() + LLVM.Function(mod, name, LLVM.FunctionType(T_ptr, [T_i64, T_i64])) end return intr end @@ -1272,7 +1274,7 @@ function alloca_value(@nospecialize(T), N::Int) mod = LLVM.parent(llvm_f) # get intrinsic - intr = alloca_intr(mod, bytes, align) + intr = alloca_intr(mod, T_ptr) intr_ft = function_type(intr) # generate IR @@ -1280,7 +1282,9 @@ function alloca_value(@nospecialize(T), N::Int) entry = BasicBlock(llvm_f, "entry") position!(builder, entry) - ptr = call!(builder, intr_ft, intr, Value[], "alloca") + args = Value[ConstantInt(LLVM.Int64Type(), bytes), + ConstantInt(LLVM.Int64Type(), align)] + ptr = call!(builder, intr_ft, intr, args, "alloca") ret!(builder, ptr) end @@ -1294,54 +1298,38 @@ end # (an `alloca` of GC-tracked references would be unrooted). 
intended as a building block for # higher-level scratch abstractions (e.g. KernelAbstractions' `@private`). @inline @generated alloca(::Type{T}, ::Val{N}) where {T,N} = alloca_value(T, N) -export alloca -# replace every `julia.gpu.alloca.*` call with an entry-block alloca in the containing function +# replace every `julia.gpu.alloca` call with an entry-block alloca in the containing function function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module) - changed = false - prefix = "julia.gpu.alloca." - - for intr in functions(mod) - fn = LLVM.name(intr) - startswith(fn, prefix) || continue - - bytes, align = parse.(Int, split(fn[length(prefix)+1:end], '.')) - slot_typ = LLVM.ArrayType(LLVM.Int8Type(), bytes) + haskey(functions(mod), "julia.gpu.alloca") || return false + intr = functions(mod)["julia.gpu.alloca"] + @dispose builder=IRBuilder() begin for use in collect(uses(intr)) call = user(use) @assert call isa LLVM.CallInst + bytes, align = convert.(Int, operands(call)[1:2]) f = LLVM.parent(LLVM.parent(call)) - @dispose builder=IRBuilder() begin - # materialize the slot at the top of the entry block so that it is a static - # alloca (promotable, and allocated once rather than per loop iteration). - position!(builder, first(instructions(first(blocks(f))))) - slot = alloca!(builder, slot_typ, "alloca") - alignment!(slot, align) - - # `alloca!` placed the slot in the datalayout's alloca address space; cast back - # to generic (AS 0) to match the `Ptr` `alloca` returns. - ptr = if value_type(slot) == value_type(call) - slot - else - if LLVM.addrspace(value_type(slot)) == LLVM.addrspace(value_type(call)) - bitcast!(builder, slot, value_type(call)) - else - addrspacecast!(builder, slot, value_type(call)) - end - end - replace_uses!(call, ptr) - end + # materialize the slot at the top of the entry block so that it is a static + # alloca (promotable, and allocated once rather than per loop iteration). 
+ position!(builder, first(instructions(first(blocks(f))))) + slot = alloca!(builder, LLVM.ArrayType(LLVM.Int8Type(), bytes), "alloca") + alignment!(slot, align) + + # `alloca!` placed the slot in the datalayout's alloca address space; cast back + # to generic (AS 0) to match the `Ptr` `alloca` returns. + ptr = pointercast!(builder, slot, value_type(call)) + + replace_uses!(call, ptr) erase!(call) - changed = true end - - @assert isempty(uses(intr)) - erase!(intr) end - return changed + @assert isempty(uses(intr)) + erase!(intr) + + return true end # convert kernel state argument from pass-by-value to pass-by-reference diff --git a/test/metal.jl b/test/metal.jl index 47398ec4..c1f8b369 100644 --- a/test/metal.jl +++ b/test/metal.jl @@ -1270,45 +1270,4 @@ end end end -@testset "stack allocation intrinsic" begin - mod = @eval module $(gensym()) - import ..GPUCompiler - - function scratch(x) - p = GPUCompiler.alloca(Float32, Val(8)) - @inbounds unsafe_store!(p, x, 1) - @inbounds unsafe_store!(p, x, 8) - return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) - end - - # zero-element scratch yields a (null) pointer without emitting an alloca - empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) - end - - # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`, - # and no `julia.gpu.alloca` call/declaration survives lowering. - @test @filecheck begin - @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" - @check "alloca [32 x i8], align 4" - @check_not "julia.gpu.alloca" - Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true) - end - - # once optimized the slot is promoted away entirely (result is x + x). - @test @filecheck begin - @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" - @check_not "alloca" - @check_not "julia.gpu.alloca" - Metal.code_llvm(mod.scratch, Tuple{Float32}) - end - - # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. 
- @test @filecheck begin - @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" - @check_not "alloca" - @check_not "julia.gpu.alloca" - Metal.code_llvm(mod.empty_scratch, Tuple{}) - end -end - end diff --git a/test/ptx.jl b/test/ptx.jl index 84f8b753..2d93f3d5 100644 --- a/test/ptx.jl +++ b/test/ptx.jl @@ -137,47 +137,6 @@ end @test occursin("call void @julia_", ir) end -@testset "stack allocation intrinsic" begin - mod = @eval module $(gensym()) - import ..GPUCompiler - - function scratch(x) - p = GPUCompiler.alloca(Float32, Val(8)) - @inbounds unsafe_store!(p, x, 1) - @inbounds unsafe_store!(p, x, 8) - return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) - end - - # zero-element scratch yields a (null) pointer without emitting an alloca - empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) - end - - # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`, - # and no `julia.gpu.alloca` call/declaration survives lowering. - @test @filecheck begin - @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" - @check "alloca [32 x i8], align 4" - @check_not "julia.gpu.alloca" - PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true) - end - - # once optimized the slot is promoted away entirely (result is x + x). - @test @filecheck begin - @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" - @check_not "alloca" - @check_not "julia.gpu.alloca" - PTX.code_llvm(mod.scratch, Tuple{Float32}) - end - - # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. 
- @test @filecheck begin - @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" - @check_not "alloca" - @check_not "julia.gpu.alloca" - PTX.code_llvm(mod.empty_scratch, Tuple{}) - end -end - end ############################################################################################ diff --git a/test/spirv.jl b/test/spirv.jl index 7c01a118..b5006194 100644 --- a/test/spirv.jl +++ b/test/spirv.jl @@ -203,45 +203,4 @@ end end end -@testset "stack allocation intrinsic" begin - mod = @eval module $(gensym()) - import ..GPUCompiler - - function scratch(x) - p = GPUCompiler.alloca(Float32, Val(8)) - @inbounds unsafe_store!(p, x, 1) - @inbounds unsafe_store!(p, x, 8) - return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8) - end - - # zero-element scratch yields a (null) pointer without emitting an alloca - empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL) - end - - # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`, - # and no `julia.gpu.alloca` call/declaration survives lowering. - @test @filecheck begin - @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" - @check "alloca [32 x i8], align 4" - @check_not "julia.gpu.alloca" - SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false, dump_module=true) - end - - # once optimized the slot is promoted away entirely (result is x + x). - @test @filecheck begin - @check_label "define float @{{(julia|j)_scratch_[0-9]+}}" - @check_not "alloca" - @check_not "julia.gpu.alloca" - SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend) - end - - # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca. - @test @filecheck begin - @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}" - @check_not "alloca" - @check_not "julia.gpu.alloca" - SPIRV.code_llvm(mod.empty_scratch, Tuple{}; backend) - end -end - end