From 0632239fb6a1e3a04f483f90dbea5ea4a7de7a95 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 22 Jun 2026 17:37:19 +0200
Subject: [PATCH 1/5] Add `alloca` intrinsic for per-workitem stack scratch

Introduce `GPUCompiler.alloca(::Type{T}, ::Val{N})::Ptr{T}`, which hands device
code a fixed-size, per-workitem stack scratch buffer for `N` elements of `T`.

This is meant to replace abstractions like KernelAbstractions' `@private`
`MArray`-backed scratchpad with a direct stack allocation. Emitting the `alloca`
through `llvmcall` directly is unsound/ineffective: the `Ptr` round-trip through
`ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target stack address
space (e.g. addrspace 5 on NVPTX/AMDGPU) isn't known at the front end, and the
LangRef lifetime of the `alloca` is tied to the inlined `llvmcall` wrapper.

Instead, the front end emits a `julia.gpu.alloca.<bytes>.<align>` intrinsic that
`lower_alloca!` (run from `irgen`, before the optimizer) materializes as a real
entry-block `alloca` in the datalayout's alloca address space, cast back to
generic. Running before optimization lets the slot be promoted just like the
mutable stack allocations Julia already emits. `T` must be `isbits`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/irgen.jl   | 123 +++++++++++++++++++++++++++++++++++++++++++++++++
 test/native.jl |  41 +++++++++++++++++
 2 files changed, 164 insertions(+)
diff --git a/src/irgen.jl b/src/irgen.jl
index 80dac84d..c8979e0d 100644
--- a/src/irgen.jl
+++ b/src/irgen.jl
@@ -149,6 +149,10 @@ function irgen(@nospecialize(job::CompilerJob))
         # the job's configured level, so device code can branch on it as a compile-time
         # constant that is part of the cache key (unlike reading the `-g` global directly).
         lower_debug_level!(job, mod)
+
+        # materialize `GPUCompiler.alloca` intrinsics as real entry-block allocas, before the
+        # optimizer runs so the slots can be promoted (see `lower_alloca!`).
+        lower_alloca!(job, mod)
     end
 
     return mod, compiled, gv_to_value
@@ -1216,6 +1220,125 @@ function lower_debug_level!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
     return true
 end
 
+
+## stack allocation
+
+# device code can request a fixed-size, per-workitem stack scratch buffer via
+# `alloca(T, Val(N))`, returning a `Ptr{T}` to uninitialized storage for `N` elements of
+# `T`. this emits the `julia.gpu.alloca.<bytes>.<align>` intrinsic, which `lower_alloca!`
+# (run from `irgen`, before the optimizer) materializes as a real entry-block `alloca`.
+#
+# this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective:
+# the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target
+# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front end, and the
+# LangRef lifetime of an `alloca` is tied to the (inlined) `llvmcall` wrapper. lowering it
+# ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca
+# address space, early enough for the optimizer to promote it.
+
+function alloca_intr(mod::LLVM.Module, bytes::Integer, align::Integer)
+    name = "julia.gpu.alloca.$(bytes).$(align)"
+    intr = if haskey(functions(mod), name)
+        functions(mod)[name]
+    else
+        # returns an opaque pointer; intentionally *not* readnone/speculatable, as each call
+        # must yield a distinct slot and must not be hoisted or CSE'd.
+        LLVM.Function(mod, name, LLVM.FunctionType(LLVM.PointerType()))
+    end
+    return intr
+end
+
+# run-time equivalent: emits a call to the alloca intrinsic, returning a `Ptr{T}` to scratch
+# storage for `N` elements of `T` (materialized by `lower_alloca!`).
+function alloca_value(@nospecialize(T), N::Int)
+    isbitstype(T) ||
+        error("GPUCompiler.alloca only supports `isbits` element types, got $T")
+    N >= 0 || throw(ArgumentError("GPUCompiler.alloca count must be non-negative, got $N"))
+
+    bytes = sizeof(T) * N
+    align = Base.datatype_alignment(T)
+
+    # a zero-byte allocation has no storage to point at; hand back a null pointer rather than
+    # emitting a degenerate 0-element alloca.
+    if bytes == 0
+        return :(reinterpret(Ptr{$T}, C_NULL))
+    end
+
+    @dispose ctx=Context() begin
+        T_ptr = LLVM.PointerType()
+
+        # create function
+        llvm_f, _ = create_function(T_ptr)
+        mod = LLVM.parent(llvm_f)
+
+        # get intrinsic
+        intr = alloca_intr(mod, bytes, align)
+        intr_ft = function_type(intr)
+
+        # generate IR
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
+            position!(builder, entry)
+
+            ptr = call!(builder, intr_ft, intr, Value[], "alloca")
+
+            ret!(builder, ptr)
+        end
+
+        call_function(llvm_f, Ptr{T})
+    end
+end
+
+# device-facing accessor: a `Ptr{T}` to per-workitem stack scratch for `N` elements of `T`.
+# the storage is uninitialized and only valid within the calling kernel. `T` must be `isbits`
+# (an `alloca` of GC-tracked references would be unrooted). intended as a building block for
+# higher-level scratch abstractions (e.g. KernelAbstractions' `@private`).
+@inline @generated alloca(::Type{T}, ::Val{N}) where {T,N} = alloca_value(T, N)
+export alloca
+
+# replace every `julia.gpu.alloca.*` call with an entry-block alloca in the containing function
+function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
+    changed = false
+    prefix = "julia.gpu.alloca."
+
+    for intr in collect(functions(mod))
+        fn = LLVM.name(intr)
+        startswith(fn, prefix) || continue
+
+        bytes, align = parse.(Int, split(fn[length(prefix)+1:end], '.'))
+        slot_typ = LLVM.ArrayType(LLVM.Int8Type(), bytes)
+
+        for use in collect(uses(intr))
+            call = user(use)
+            @assert call isa LLVM.CallInst
+            f = LLVM.parent(LLVM.parent(call))
+
+            @dispose builder=IRBuilder() begin
+                # materialize the slot at the top of the entry block so that it is a static
+                # alloca (promotable, and allocated once rather than per loop iteration).
+                position!(builder, first(instructions(first(blocks(f)))))
+                slot = alloca!(builder, slot_typ, "alloca")
+                alignment!(slot, align)
+
+                # `alloca!` placed the slot in the datalayout's alloca address space; cast back
+                # to generic (AS 0) to match the `Ptr` the front end handed out.
+                ptr = if LLVM.addrspace(value_type(slot)) == 0
+                    slot
+                else
+                    addrspacecast!(builder, slot, LLVM.PointerType())
+                end
+                replace_uses!(call, ptr)
+            end
+            erase!(call)
+            changed = true
+        end
+
+        @assert isempty(uses(intr))
+        erase!(intr)
+    end
+
+    return changed
+end
+
 # convert kernel state argument from pass-by-value to pass-by-reference
 #
 # the kernel state argument is always passed by value to avoid codegen issues with byval.
diff --git a/test/native.jl b/test/native.jl
index 95980c8e..613780de 100644
--- a/test/native.jl
+++ b/test/native.jl
@@ -739,3 +739,44 @@ end
     @test !occursin("deferred_codegen", ir)
     @test occursin("call void @julia_kernel", ir)
 end
+
+@testset "stack allocation intrinsic" begin
+    mod = @eval module $(gensym())
+        import ..GPUCompiler
+
+        function scratch(x)
+            p = GPUCompiler.alloca(Float32, Val(8))
+            @inbounds unsafe_store!(p, x, 1)
+            @inbounds unsafe_store!(p, x, 8)
+            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
+        end
+
+        # zero-element scratch yields a (null) pointer without emitting an alloca
+        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
+    end
+
+    # the intrinsic is materialized as a single entry-block `alloca [8 x f32 = 32 x i8]`,
+    # and no `julia.gpu.alloca` call/declaration survives lowering.
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check "alloca [32 x i8], align 4"
+        @check_not "julia.gpu.alloca"
+        Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+    end
+
+    # once optimized the slot is promoted away entirely (result is x + x).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        Native.code_llvm(mod.scratch, Tuple{Float32})
+    end
+
+    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
+    @test @filecheck begin
+        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        Native.code_llvm(mod.empty_scratch, Tuple{})
+    end
+end

From e13a8c5b0c49e23769ea2aae308cfac1db6b7ddc Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 23 Jun 2026 09:21:43 +0200
Subject: [PATCH 2/5] cleanup

---
 src/irgen.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/irgen.jl b/src/irgen.jl
index c8979e0d..78864221 100644
--- a/src/irgen.jl
+++ b/src/irgen.jl
@@ -1230,7 +1230,7 @@ end
 #
 # this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective:
 # the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target
-# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front end, and the
+# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front-end, and the
 # LangRef lifetime of an `alloca` is tied to the (inlined) `llvmcall` wrapper. lowering it
 # ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca
 # address space, early enough for the optimizer to promote it.
@@ -1300,7 +1300,7 @@ function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
     changed = false
     prefix = "julia.gpu.alloca."
 
-    for intr in collect(functions(mod))
+    for intr in functions(mod)
         fn = LLVM.name(intr)
         startswith(fn, prefix) || continue
 
@@ -1320,7 +1320,7 @@ function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
                 alignment!(slot, align)
 
                 # `alloca!` placed the slot in the datalayout's alloca address space; cast back
-                # to generic (AS 0) to match the `Ptr` the front end handed out.
+                # to generic (AS 0) to match the `Ptr` `alloca` returns.
                 ptr = if LLVM.addrspace(value_type(slot)) == 0
                     slot
                 else

From 8563d150efd97bb4a8d121adc62c5bc7bbffb9df Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 30 Jun 2026 13:21:03 +0200
Subject: [PATCH 3/5] Add stack allocation intrinsic tests for PTX, GCN, SPIRV,
 and Metal

Test that `GPUCompiler.alloca` lowers correctly for all GPU targets.
---
 test/gcn.jl   | 41 +++++++++++++++++++++++++++++++++++++++++
 test/metal.jl | 41 +++++++++++++++++++++++++++++++++++++++++
 test/ptx.jl   | 41 +++++++++++++++++++++++++++++++++++++++++
 test/spirv.jl | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+)

diff --git a/test/gcn.jl b/test/gcn.jl
index 99dae742..1ec598fe 100644
--- a/test/gcn.jl
+++ b/test/gcn.jl
@@ -476,5 +476,46 @@ end
     GCN.code_native(devnull, mod.kernel, Tuple{Float32,Ptr{Float32}})
 end
 
+@testset "stack allocation intrinsic" begin
+    mod = @eval module $(gensym())
+        import ..GPUCompiler
+
+        function scratch(x)
+            p = GPUCompiler.alloca(Float32, Val(8))
+            @inbounds unsafe_store!(p, x, 1)
+            @inbounds unsafe_store!(p, x, 8)
+            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
+        end
+
+        # zero-element scratch yields a (null) pointer without emitting an alloca
+        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
+    end
+
+    # AMDGPU uses alloca address space 5, so the materialized slot lives in AS 5 and
+    # `lower_alloca!` emits an `addrspacecast` back to generic (AS 0).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check "alloca [32 x i8], align 4, addrspace(5)"
+        @check "addrspacecast"
+        @check_not "julia.gpu.alloca"
+        GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+    end
+
+    # once optimized the slot is promoted away entirely (result is x + x).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check_not "julia.gpu.alloca"
+        GCN.code_llvm(mod.scratch, Tuple{Float32})
+    end
+
+    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
+    @test @filecheck begin
+        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        GCN.code_llvm(mod.empty_scratch, Tuple{})
+    end
+end
+
 end
 end # :AMDGPU in LLVM.backends()
diff --git a/test/metal.jl b/test/metal.jl
index c1f8b369..02b0b818 100644
--- a/test/metal.jl
+++ b/test/metal.jl
@@ -1270,4 +1270,45 @@ end
     end
 end
 
+@testset "stack allocation intrinsic" begin
+    mod = @eval module $(gensym())
+        import ..GPUCompiler
+
+        function scratch(x)
+            p = GPUCompiler.alloca(Float32, Val(8))
+            @inbounds unsafe_store!(p, x, 1)
+            @inbounds unsafe_store!(p, x, 8)
+            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
+        end
+
+        # zero-element scratch yields a (null) pointer without emitting an alloca
+        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
+    end
+
+    # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`,
+    # and no `julia.gpu.alloca` call/declaration survives lowering.
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check "alloca [32 x i8], align 4"
+        @check_not "julia.gpu.alloca"
+        Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+    end
+
+    # once optimized the slot is promoted away entirely (result is x + x).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        Metal.code_llvm(mod.scratch, Tuple{Float32})
+    end
+
+    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
+    @test @filecheck begin
+        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        Metal.code_llvm(mod.empty_scratch, Tuple{})
+    end
+end
+
 end
diff --git a/test/ptx.jl b/test/ptx.jl
index 2d93f3d5..3cd35bbc 100644
--- a/test/ptx.jl
+++ b/test/ptx.jl
@@ -137,6 +137,47 @@ end
     @test occursin("call void @julia_", ir)
 end
 
+@testset "stack allocation intrinsic" begin
+    mod = @eval module $(gensym())
+        import ..GPUCompiler
+
+        function scratch(x)
+            p = GPUCompiler.alloca(Float32, Val(8))
+            @inbounds unsafe_store!(p, x, 1)
+            @inbounds unsafe_store!(p, x, 8)
+            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
+        end
+
+        # zero-element scratch yields a (null) pointer without emitting an alloca
+        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
+    end
+
+    # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`,
+    # and no `julia.gpu.alloca` call/declaration survives lowering.
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check "alloca [32 x i8], align 4"
+        @check_not "julia.gpu.alloca"
+        PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+    end
+
+    # once optimized the slot is promoted away entirely (result is x + x).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        PTX.code_llvm(mod.scratch, Tuple{Float32})
+    end
+
+    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
+    @test @filecheck begin
+        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        PTX.code_llvm(mod.empty_scratch, Tuple{})
+    end
+end
+
 end
 
 ############################################################################################
diff --git a/test/spirv.jl b/test/spirv.jl
index b5006194..9b9bd247 100644
--- a/test/spirv.jl
+++ b/test/spirv.jl
@@ -203,4 +203,45 @@ end
     end
 end
 
+@testset "stack allocation intrinsic" begin
+    mod = @eval module $(gensym())
+        import ..GPUCompiler
+
+        function scratch(x)
+            p = GPUCompiler.alloca(Float32, Val(8))
+            @inbounds unsafe_store!(p, x, 1)
+            @inbounds unsafe_store!(p, x, 8)
+            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
+        end
+
+        # zero-element scratch yields a (null) pointer without emitting an alloca
+        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
+    end
+
+    # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`,
+    # and no `julia.gpu.alloca` call/declaration survives lowering.
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check "alloca [32 x i8], align 4"
+        @check_not "julia.gpu.alloca"
+        SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false)
+    end
+
+    # once optimized the slot is promoted away entirely (result is x + x).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend)
+    end
+
+    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
+    @test @filecheck begin
+        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        SPIRV.code_llvm(mod.empty_scratch, Tuple{}; backend)
+    end
+end
+
 end

From 699d04c2d7f7c041781c3df9cf68cd3f72edf134 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 30 Jun 2026 13:57:06 +0200
Subject: [PATCH 4/5] Fix alloca intrinsic under typed pointers and older Julia
 versions

1. On Julia 1.10/1.11, LLVM runs in typed-pointers mode. PointerType() creates an opaque pointer in LLVM 15, causing assembly parsing errors on llvmcall since opaque pointers are disabled by default. Fix this by using PointerType(Int8Type()) when supports_typed_pointers() is true.
2. In lower_alloca!, if the pointer types or address spaces differ (which is common under typed pointers where the slot is [32 x i8]* but the call returns i8*), dynamically handle casting using bitcast! or addrspacecast! as appropriate.
3. On Julia 1.10/1.11, under optimize=false, the compiler compiles the llvmcall into a non-inlined helper function. Fix test checking by passing dump_module=true to code_llvm in the unoptimized checks to allow matching inside helper functions.
4. Dev the package in test/Project.toml so that Pkg loads the local repository package under Julia 1.10.
---
 src/irgen.jl      | 13 +++++++++----
 test/Project.toml |  3 +++
 test/gcn.jl       |  2 +-
 test/metal.jl     |  2 +-
 test/native.jl    |  2 +-
 test/ptx.jl       |  2 +-
 test/spirv.jl     |  2 +-
 7 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/irgen.jl b/src/irgen.jl
index 78864221..c391fbdc 100644
--- a/src/irgen.jl
+++ b/src/irgen.jl
@@ -1242,7 +1242,8 @@ function alloca_intr(mod::LLVM.Module, bytes::Integer, align::Integer)
     else
         # returns an opaque pointer; intentionally *not* readnone/speculatable, as each call
         # must yield a distinct slot and must not be hoisted or CSE'd.
-        LLVM.Function(mod, name, LLVM.FunctionType(LLVM.PointerType()))
+        T_ptr = LLVM.supports_typed_pointers(LLVM.context(mod)) ? LLVM.PointerType(LLVM.Int8Type()) : LLVM.PointerType()
+        LLVM.Function(mod, name, LLVM.FunctionType(T_ptr))
     end
     return intr
 end
@@ -1264,7 +1265,7 @@ function alloca_value(@nospecialize(T), N::Int)
     end
 
     @dispose ctx=Context() begin
-        T_ptr = LLVM.PointerType()
+        T_ptr = LLVM.supports_typed_pointers(ctx) ? LLVM.PointerType(LLVM.Int8Type()) : LLVM.PointerType()
 
         # create function
         llvm_f, _ = create_function(T_ptr)
@@ -1321,10 +1322,14 @@ function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
 
                 # `alloca!` placed the slot in the datalayout's alloca address space; cast back
                 # to generic (AS 0) to match the `Ptr` `alloca` returns.
-                ptr = if LLVM.addrspace(value_type(slot)) == 0
+                ptr = if value_type(slot) == value_type(call)
                     slot
                 else
-                    addrspacecast!(builder, slot, LLVM.PointerType())
+                    if LLVM.addrspace(value_type(slot)) == LLVM.addrspace(value_type(call))
+                        bitcast!(builder, slot, value_type(call))
+                    else
+                        addrspacecast!(builder, slot, value_type(call))
+                    end
                 end
                 replace_uses!(call, ptr)
             end
diff --git a/test/Project.toml b/test/Project.toml
index d01c6e6e..330867c5 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -23,3 +23,6 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673"
 Aqua = "0.8"
 LLVM_jll = "15,16,18,20"
 ParallelTestRunner = "2"
+
+[sources]
+GPUCompiler = { path=".." }
diff --git a/test/gcn.jl b/test/gcn.jl
index 1ec598fe..5e35d580 100644
--- a/test/gcn.jl
+++ b/test/gcn.jl
@@ -498,7 +498,7 @@ end
         @check "alloca [32 x i8], align 4, addrspace(5)"
         @check "addrspacecast"
         @check_not "julia.gpu.alloca"
-        GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+        GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
     end
 
     # once optimized the slot is promoted away entirely (result is x + x).
diff --git a/test/metal.jl b/test/metal.jl
index 02b0b818..47398ec4 100644
--- a/test/metal.jl
+++ b/test/metal.jl
@@ -1291,7 +1291,7 @@ end
         @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
         @check "alloca [32 x i8], align 4"
         @check_not "julia.gpu.alloca"
-        Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+        Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
     end
 
     # once optimized the slot is promoted away entirely (result is x + x).
diff --git a/test/native.jl b/test/native.jl
index 613780de..15010864 100644
--- a/test/native.jl
+++ b/test/native.jl
@@ -761,7 +761,7 @@ end
         @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
         @check "alloca [32 x i8], align 4"
         @check_not "julia.gpu.alloca"
-        Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+        Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
     end
 
     # once optimized the slot is promoted away entirely (result is x + x).
diff --git a/test/ptx.jl b/test/ptx.jl
index 3cd35bbc..84f8b753 100644
--- a/test/ptx.jl
+++ b/test/ptx.jl
@@ -158,7 +158,7 @@ end
         @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
         @check "alloca [32 x i8], align 4"
         @check_not "julia.gpu.alloca"
-        PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false)
+        PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
     end
 
     # once optimized the slot is promoted away entirely (result is x + x).
diff --git a/test/spirv.jl b/test/spirv.jl
index 9b9bd247..7c01a118 100644
--- a/test/spirv.jl
+++ b/test/spirv.jl
@@ -224,7 +224,7 @@ end
         @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
         @check "alloca [32 x i8], align 4"
         @check_not "julia.gpu.alloca"
-        SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false)
+        SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false, dump_module=true)
     end
 
     # once optimized the slot is promoted away entirely (result is x + x).

From fdc476561fc5600d1e7612f1cf9f69f997c09c6b Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sat, 4 Jul 2026 23:48:35 +0200
Subject: [PATCH 5/5] Simplify the alloca intrinsic

- Encode size and alignment as constant call operands of a single
  `julia.gpu.alloca` declaration (matching `deferred_codegen`), instead
  of mangling them into the intrinsic name; `lower_alloca!` now guards
  with an O(1) name lookup like `lower_debug_level!`.
- Replace the hand-rolled bitcast/addrspacecast selection with
  `pointercast!`, and hoist the IRBuilder out of the per-call loop.
- Test the lowering on native (generic path) and GCN (nonzero alloca
  address space) only, dropping the identical PTX/SPIRV/Metal copies.
- Don't export `alloca`; downstream users call it qualified.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/irgen.jl  | 80 ++++++++++++++++++++++-----------------------------
 test/metal.jl | 41 --------------------------
 test/ptx.jl   | 41 --------------------------
 test/spirv.jl | 41 --------------------------
 4 files changed, 34 insertions(+), 169 deletions(-)

diff --git a/src/irgen.jl b/src/irgen.jl
index c391fbdc..169123a6 100644
--- a/src/irgen.jl
+++ b/src/irgen.jl
@@ -1225,8 +1225,9 @@ end
 
 # device code can request a fixed-size, per-workitem stack scratch buffer via
 # `alloca(T, Val(N))`, returning a `Ptr{T}` to uninitialized storage for `N` elements of
-# `T`. this emits the `julia.gpu.alloca.<bytes>.<align>` intrinsic, which `lower_alloca!`
-# (run from `irgen`, before the optimizer) materializes as a real entry-block `alloca`.
+# `T`. this emits a call to the `julia.gpu.alloca` intrinsic with the size and alignment
+# as constant operands, which `lower_alloca!` (run from `irgen`, before the optimizer)
+# materializes as a real entry-block `alloca`.
 #
 # this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective:
 # the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target
@@ -1235,15 +1236,16 @@ end
 # ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca
 # address space, early enough for the optimizer to promote it.
 
-function alloca_intr(mod::LLVM.Module, bytes::Integer, align::Integer)
-    name = "julia.gpu.alloca.$(bytes).$(align)"
+function alloca_intr(mod::LLVM.Module, T_ptr::LLVMType)
+    name = "julia.gpu.alloca"
     intr = if haskey(functions(mod), name)
         functions(mod)[name]
     else
-        # returns an opaque pointer; intentionally *not* readnone/speculatable, as each call
-        # must yield a distinct slot and must not be hoisted or CSE'd.
-        T_ptr = LLVM.supports_typed_pointers(LLVM.context(mod)) ? LLVM.PointerType(LLVM.Int8Type()) : LLVM.PointerType()
-        LLVM.Function(mod, name, LLVM.FunctionType(T_ptr))
+        # takes the size in bytes and the alignment as constant operands, and returns an
+        # opaque pointer; intentionally *not* readnone/speculatable, as each call must
+        # yield a distinct slot and must not be hoisted or CSE'd.
+        T_i64 = LLVM.Int64Type()
+        LLVM.Function(mod, name, LLVM.FunctionType(T_ptr, [T_i64, T_i64]))
     end
     return intr
 end
@@ -1272,7 +1274,7 @@ function alloca_value(@nospecialize(T), N::Int)
         mod = LLVM.parent(llvm_f)
 
         # get intrinsic
-        intr = alloca_intr(mod, bytes, align)
+        intr = alloca_intr(mod, T_ptr)
         intr_ft = function_type(intr)
 
         # generate IR
@@ -1280,7 +1282,9 @@ function alloca_value(@nospecialize(T), N::Int)
             entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
-            ptr = call!(builder, intr_ft, intr, Value[], "alloca")
+            args = Value[ConstantInt(LLVM.Int64Type(), bytes),
+                         ConstantInt(LLVM.Int64Type(), align)]
+            ptr = call!(builder, intr_ft, intr, args, "alloca")
 
             ret!(builder, ptr)
         end
@@ -1294,54 +1298,38 @@ end
 # (an `alloca` of GC-tracked references would be unrooted). intended as a building block for
 # higher-level scratch abstractions (e.g. KernelAbstractions' `@private`).
 @inline @generated alloca(::Type{T}, ::Val{N}) where {T,N} = alloca_value(T, N)
-export alloca
 
-# replace every `julia.gpu.alloca.*` call with an entry-block alloca in the containing function
+# replace every `julia.gpu.alloca` call with an entry-block alloca in the containing function
 function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
-    changed = false
-    prefix = "julia.gpu.alloca."
-
-    for intr in functions(mod)
-        fn = LLVM.name(intr)
-        startswith(fn, prefix) || continue
-
-        bytes, align = parse.(Int, split(fn[length(prefix)+1:end], '.'))
-        slot_typ = LLVM.ArrayType(LLVM.Int8Type(), bytes)
+    haskey(functions(mod), "julia.gpu.alloca") || return false
+    intr = functions(mod)["julia.gpu.alloca"]
 
+    @dispose builder=IRBuilder() begin
         for use in collect(uses(intr))
             call = user(use)
             @assert call isa LLVM.CallInst
+            bytes, align = convert.(Int, operands(call)[1:2])
             f = LLVM.parent(LLVM.parent(call))
 
-            @dispose builder=IRBuilder() begin
-                # materialize the slot at the top of the entry block so that it is a static
-                # alloca (promotable, and allocated once rather than per loop iteration).
-                position!(builder, first(instructions(first(blocks(f)))))
-                slot = alloca!(builder, slot_typ, "alloca")
-                alignment!(slot, align)
-
-                # `alloca!` placed the slot in the datalayout's alloca address space; cast back
-                # to generic (AS 0) to match the `Ptr` `alloca` returns.
-                ptr = if value_type(slot) == value_type(call)
-                    slot
-                else
-                    if LLVM.addrspace(value_type(slot)) == LLVM.addrspace(value_type(call))
-                        bitcast!(builder, slot, value_type(call))
-                    else
-                        addrspacecast!(builder, slot, value_type(call))
-                    end
-                end
-                replace_uses!(call, ptr)
-            end
+            # materialize the slot at the top of the entry block so that it is a static
+            # alloca (promotable, and allocated once rather than per loop iteration).
+            position!(builder, first(instructions(first(blocks(f)))))
+            slot = alloca!(builder, LLVM.ArrayType(LLVM.Int8Type(), bytes), "alloca")
+            alignment!(slot, align)
+
+            # `alloca!` placed the slot in the datalayout's alloca address space; cast back
+            # to generic (AS 0) to match the `Ptr` `alloca` returns.
+            ptr = pointercast!(builder, slot, value_type(call))
+
+            replace_uses!(call, ptr)
             erase!(call)
-            changed = true
         end
-
-        @assert isempty(uses(intr))
-        erase!(intr)
     end
 
-    return changed
+    @assert isempty(uses(intr))
+    erase!(intr)
+
+    return true
 end
 
 # convert kernel state argument from pass-by-value to pass-by-reference
diff --git a/test/metal.jl b/test/metal.jl
index 47398ec4..c1f8b369 100644
--- a/test/metal.jl
+++ b/test/metal.jl
@@ -1270,45 +1270,4 @@ end
     end
 end
 
-@testset "stack allocation intrinsic" begin
-    mod = @eval module $(gensym())
-        import ..GPUCompiler
-
-        function scratch(x)
-            p = GPUCompiler.alloca(Float32, Val(8))
-            @inbounds unsafe_store!(p, x, 1)
-            @inbounds unsafe_store!(p, x, 8)
-            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
-        end
-
-        # zero-element scratch yields a (null) pointer without emitting an alloca
-        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
-    end
-
-    # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`,
-    # and no `julia.gpu.alloca` call/declaration survives lowering.
-    @test @filecheck begin
-        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
-        @check "alloca [32 x i8], align 4"
-        @check_not "julia.gpu.alloca"
-        Metal.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
-    end
-
-    # once optimized the slot is promoted away entirely (result is x + x).
-    @test @filecheck begin
-        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
-        @check_not "alloca"
-        @check_not "julia.gpu.alloca"
-        Metal.code_llvm(mod.scratch, Tuple{Float32})
-    end
-
-    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
-    @test @filecheck begin
-        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
-        @check_not "alloca"
-        @check_not "julia.gpu.alloca"
-        Metal.code_llvm(mod.empty_scratch, Tuple{})
-    end
-end
-
 end
diff --git a/test/ptx.jl b/test/ptx.jl
index 84f8b753..2d93f3d5 100644
--- a/test/ptx.jl
+++ b/test/ptx.jl
@@ -137,47 +137,6 @@ end
     @test occursin("call void @julia_", ir)
 end
 
-@testset "stack allocation intrinsic" begin
-    mod = @eval module $(gensym())
-        import ..GPUCompiler
-
-        function scratch(x)
-            p = GPUCompiler.alloca(Float32, Val(8))
-            @inbounds unsafe_store!(p, x, 1)
-            @inbounds unsafe_store!(p, x, 8)
-            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
-        end
-
-        # zero-element scratch yields a (null) pointer without emitting an alloca
-        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
-    end
-
-    # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`,
-    # and no `julia.gpu.alloca` call/declaration survives lowering.
-    @test @filecheck begin
-        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
-        @check "alloca [32 x i8], align 4"
-        @check_not "julia.gpu.alloca"
-        PTX.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
-    end
-
-    # once optimized the slot is promoted away entirely (result is x + x).
-    @test @filecheck begin
-        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
-        @check_not "alloca"
-        @check_not "julia.gpu.alloca"
-        PTX.code_llvm(mod.scratch, Tuple{Float32})
-    end
-
-    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
-    @test @filecheck begin
-        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
-        @check_not "alloca"
-        @check_not "julia.gpu.alloca"
-        PTX.code_llvm(mod.empty_scratch, Tuple{})
-    end
-end
-
 end
 
 ############################################################################################
diff --git a/test/spirv.jl b/test/spirv.jl
index 7c01a118..b5006194 100644
--- a/test/spirv.jl
+++ b/test/spirv.jl
@@ -203,45 +203,4 @@ end
     end
 end
 
-@testset "stack allocation intrinsic" begin
-    mod = @eval module $(gensym())
-        import ..GPUCompiler
-
-        function scratch(x)
-            p = GPUCompiler.alloca(Float32, Val(8))
-            @inbounds unsafe_store!(p, x, 1)
-            @inbounds unsafe_store!(p, x, 8)
-            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
-        end
-
-        # zero-element scratch yields a (null) pointer without emitting an alloca
-        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
-    end
-
-    # the intrinsic is materialized as a single entry-block `alloca [32 x i8]`,
-    # and no `julia.gpu.alloca` call/declaration survives lowering.
-    @test @filecheck begin
-        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
-        @check "alloca [32 x i8], align 4"
-        @check_not "julia.gpu.alloca"
-        SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend, optimize=false, dump_module=true)
-    end
-
-    # once optimized the slot is promoted away entirely (result is x + x).
-    @test @filecheck begin
-        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
-        @check_not "alloca"
-        @check_not "julia.gpu.alloca"
-        SPIRV.code_llvm(mod.scratch, Tuple{Float32}; backend)
-    end
-
-    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
-    @test @filecheck begin
-        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
-        @check_not "alloca"
-        @check_not "julia.gpu.alloca"
-        SPIRV.code_llvm(mod.empty_scratch, Tuple{}; backend)
-    end
-end
-
 end