JuliaGPU · vchuravy · Jun 22, 2026 · Jun 23, 2026 · Jun 30, 2026 · Jun 30, 2026
diff --git a/src/irgen.jl b/src/irgen.jl
@@ -149,6 +149,10 @@ function irgen(@nospecialize(job::CompilerJob))
         # the job's configured level, so device code can branch on it as a compile-time
         # constant that is part of the cache key (unlike reading the `-g` global directly).
         lower_debug_level!(job, mod)
+
+        # materialize `GPUCompiler.alloca` intrinsics as real entry-block allocas, before the
+        # optimizer runs so the slots can be promoted (see `lower_alloca!`).
+        lower_alloca!(job, mod)
     end
 
     return mod, compiled, gv_to_value
@@ -1216,6 +1220,118 @@ function lower_debug_level!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
     return true
 end
 
+
+## stack allocation
+
+# device code can request a fixed-size, per-workitem stack scratch buffer via
+# `alloca(T, Val(N))`, returning a `Ptr{T}` to uninitialized storage for `N` elements of
+# `T`. this emits a call to the `julia.gpu.alloca` intrinsic with the size and alignment
+# as constant operands, which `lower_alloca!` (run from `irgen`, before the optimizer)
+# materializes as a real entry-block `alloca`.
+#
+# this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective:
+# the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target
+# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front-end, and the
+# LangRef lifetime of an `alloca` is tied to the (inlined) `llvmcall` wrapper. lowering it
+# ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca
+# address space, early enough for the optimizer to promote it.
+
+function alloca_intr(mod::LLVM.Module, T_ptr::LLVMType)
+    # look up the module's `julia.gpu.alloca` declaration, creating it on first use.
+    fname = "julia.gpu.alloca"
+    fns = functions(mod)
+    haskey(fns, fname) && return fns[fname]
+
+    # takes the size in bytes and the alignment as `i64` operands and returns a `T_ptr`;
+    # intentionally *not* readnone/speculatable, as each call must yield a distinct slot
+    # and must not be hoisted or CSE'd.
+    T_size = LLVM.Int64Type()
+    intr_ft = LLVM.FunctionType(T_ptr, [T_size, T_size])
+    return LLVM.Function(mod, fname, intr_ft)
+end
+
+# generator body for `alloca`: builds the expression that calls the alloca intrinsic and
+# yields a `Ptr{T}` to scratch storage for `N` elements of `T` (materialized by `lower_alloca!`).
+function alloca_value(@nospecialize(T), N::Integer)
+    # returns the expression that `alloca`'s generator splices in. throws at expansion time if
+    # `T` is not `isbits`, if `N` is negative, or if the byte size does not fit in an `Int`
+    # (`InexactError`/`OverflowError`).
+    isbitstype(T) ||
+        error("GPUCompiler.alloca only supports `isbits` element types, got $T")
+    N >= 0 || throw(ArgumentError("GPUCompiler.alloca count must be non-negative, got $N"))
+
+    # checked: a wrapped-around size would silently request the wrong amount of storage.
+    bytes = Base.Checked.checked_mul(sizeof(T), Int(N))
+    align = Base.datatype_alignment(T)
+
+    # a zero-byte allocation has no storage to point at; hand back a null pointer rather than
+    # emitting a degenerate 0-element alloca.
+    bytes == 0 && return :(reinterpret(Ptr{$T}, C_NULL))
+
+    @dispose ctx=Context() begin
+        T_ptr = LLVM.supports_typed_pointers(ctx) ? LLVM.PointerType(LLVM.Int8Type()) : LLVM.PointerType()
+
+        # create a wrapper function returning the pointer, and get the intrinsic in its module
+        llvm_f, _ = create_function(T_ptr)
+        mod = LLVM.parent(llvm_f)
+        intr = alloca_intr(mod, T_ptr)
+        intr_ft = function_type(intr)
+
+        # generate IR
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
+            position!(builder, entry)
+
+            # size and alignment travel as constant operands for `lower_alloca!` to read back
+            args = Value[ConstantInt(LLVM.Int64Type(), bytes),
+                         ConstantInt(LLVM.Int64Type(), align)]
+            ptr = call!(builder, intr_ft, intr, args, "alloca")
+            ret!(builder, ptr)
+        end
+
+        call_function(llvm_f, Ptr{T})
+    end
+end
+
+# device-facing accessor: a `Ptr{T}` to per-workitem stack scratch for `N` elements of `T`,
+# or a null pointer if that is zero bytes. the storage is uninitialized and only valid within
+# the calling kernel. `T` must be `isbits` (an `alloca` of GC-tracked references would be
+# unrooted). building block for higher-level scratch (e.g. KernelAbstractions' `@private`).
+@inline @generated alloca(::Type{T}, ::Val{N}) where {T,N} = alloca_value(T, N)
+
+# replace every `julia.gpu.alloca` call with an entry-block alloca in the containing function
+function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
+    fns = functions(mod)
+    haskey(fns, "julia.gpu.alloca") || return false
+    intr = fns["julia.gpu.alloca"]
+
+    @dispose builder=IRBuilder() begin
+        for use in collect(uses(intr))
+            call = user(use)
+            @assert call isa LLVM.CallInst
+            ops = operands(call)
+            bytes = convert(Int, ops[1])
+            align = convert(Int, ops[2])
+
+            # insert ahead of the first instruction of the enclosing function's entry block, so
+            # it is a static alloca (promotable, and allocated once rather than per iteration).
+            entry = first(blocks(LLVM.parent(LLVM.parent(call))))
+            position!(builder, first(instructions(entry)))
+            slot = alloca!(builder, LLVM.ArrayType(LLVM.Int8Type(), bytes), "alloca")
+            alignment!(slot, align)
+
+            # `alloca!` placed the slot in the datalayout's alloca address space; cast back
+            # to the call's own result type (generic, AS 0) before rewiring its users.
+            replace_uses!(call, pointercast!(builder, slot, value_type(call)))
+            erase!(call)
+        end
+    end
+
+    @assert isempty(uses(intr))
+    erase!(intr)
+    return true
+end
+
 # convert kernel state argument from pass-by-value to pass-by-reference
 #
 # the kernel state argument is always passed by value to avoid codegen issues with byval.

diff --git a/test/Project.toml b/test/Project.toml
@@ -23,3 +23,6 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673"
 Aqua = "0.8"
 LLVM_jll = "15,16,18,20"
 ParallelTestRunner = "2"
+
+[sources]
+GPUCompiler = { path=".." }
diff --git a/test/gcn.jl b/test/gcn.jl
@@ -476,5 +476,46 @@ end
     GCN.code_native(devnull, mod.kernel, Tuple{Float32,Ptr{Float32}})
 end
 
+@testset "stack allocation intrinsic" begin
+    mod = @eval module $(gensym())
+        import ..GPUCompiler
+
+        function scratch(x)
+            p = GPUCompiler.alloca(Float32, Val(8))
+            @inbounds unsafe_store!(p, x, 1)
+            @inbounds unsafe_store!(p, x, 8)
+            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
+        end
+
+        # zero-element scratch compares equal to a null pointer; no alloca should be emitted
+        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
+    end
+
+    # AMDGPU uses alloca address space 5, so the materialized slot lives in AS 5 and
+    # `lower_alloca!` emits an `addrspacecast` back to generic (AS 0).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check "alloca [32 x i8], align 4, addrspace(5)"
+        @check "addrspacecast"
+        @check_not "julia.gpu.alloca"
+        GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
+    end
+
+    # once optimized, no intrinsic call survives (promotion of the slot is not checked here).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check_not "julia.gpu.alloca"
+        GCN.code_llvm(mod.scratch, Tuple{Float32})
+    end
+
+    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
+    @test @filecheck begin
+        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        GCN.code_llvm(mod.empty_scratch, Tuple{})
+    end
+end
+
 end
 end # :AMDGPU in LLVM.backends()
diff --git a/test/native.jl b/test/native.jl
@@ -739,3 +739,44 @@ end
     @test !occursin("deferred_codegen", ir)
     @test occursin("call void @julia_kernel", ir)
 end
+
+@testset "stack allocation intrinsic" begin
+    mod = @eval module $(gensym())
+        import ..GPUCompiler
+
+        function scratch(x)
+            p = GPUCompiler.alloca(Float32, Val(8))
+            @inbounds unsafe_store!(p, x, 1)
+            @inbounds unsafe_store!(p, x, 8)
+            return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
+        end
+
+        # zero-element scratch compares equal to a null pointer; no alloca should be emitted
+        empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
+    end
+
+    # the intrinsic is materialized as an alloca of 8 x Float32 = 32 bytes (`[32 x i8]`),
+    # and no `julia.gpu.alloca` call/declaration survives lowering.
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check "alloca [32 x i8], align 4"
+        @check_not "julia.gpu.alloca"
+        Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
+    end
+
+    # once optimized the slot is promoted away entirely (result is x + x).
+    @test @filecheck begin
+        @check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        Native.code_llvm(mod.scratch, Tuple{Float32})
+    end
+
+    # a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
+    @test @filecheck begin
+        @check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
+        @check_not "alloca"
+        @check_not "julia.gpu.alloca"
+        Native.code_llvm(mod.empty_scratch, Tuple{})
+    end
+end