Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions src/irgen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,10 @@ function irgen(@nospecialize(job::CompilerJob))
# the job's configured level, so device code can branch on it as a compile-time
# constant that is part of the cache key (unlike reading the `-g` global directly).
lower_debug_level!(job, mod)

# materialize `GPUCompiler.alloca` intrinsics as real entry-block allocas, before the
# optimizer runs so the slots can be promoted (see `lower_alloca!`).
lower_alloca!(job, mod)
end

return mod, compiled, gv_to_value
Expand Down Expand Up @@ -1216,6 +1220,118 @@ function lower_debug_level!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
return true
end


## stack allocation

# device code can request a fixed-size, per-workitem stack scratch buffer via
# `alloca(T, Val(N))`, returning a `Ptr{T}` to uninitialized storage for `N` elements of
# `T`. this emits a call to the `julia.gpu.alloca` intrinsic with the size and alignment
# as constant operands, which `lower_alloca!` (run from `irgen`, before the optimizer)
# materializes as a real entry-block `alloca`.
#
# this exists because emitting an `alloca` directly through `llvmcall` is unsound/ineffective:
# the `Ptr` round-trip through `ptrtoint`/`inttoptr` blocks SROA/mem2reg promotion, the target
# stack address space (e.g. AS 5 on NVPTX/AMDGPU) isn't known at the front-end, and the
# LangRef lifetime of an `alloca` is tied to the (inlined) `llvmcall` wrapper. lowering it
# ourselves lets us place the slot in the kernel entry block, in the datalayout's alloca
# address space, early enough for the optimizer to promote it.

# look up the `julia.gpu.alloca` intrinsic declaration in `mod`, declaring it on first use.
#
# the declaration takes two `i64` operands (the call site in this file passes the size in
# bytes and the alignment as constants) and returns a value of type `T_ptr`. no attributes
# are attached: the intrinsic is intentionally *not* readnone/speculatable, as each call
# must yield a distinct slot and must not be hoisted or CSE'd.
function alloca_intr(mod::LLVM.Module, T_ptr::LLVMType)
    intr_name = "julia.gpu.alloca"
    fns = functions(mod)

    # reuse an existing declaration rather than adding a second one under the same name.
    haskey(fns, intr_name) && return fns[intr_name]

    i64 = LLVM.Int64Type()
    intr_ft = LLVM.FunctionType(T_ptr, [i64, i64])
    return LLVM.Function(mod, intr_name, intr_ft)
end

# generator for `alloca`: builds the code that yields a `Ptr{T}` to scratch storage for `N`
# elements of `T`.
#
# for a non-empty request this creates a small wrapper function whose only instructions
# are a call to the `julia.gpu.alloca` intrinsic (size in bytes and alignment passed as
# constant `i64` operands) and a `ret` of the resulting pointer, and returns whatever
# `call_function(llvm_f, Ptr{T})` produces for that wrapper. the intrinsic call itself is
# materialized later by `lower_alloca!`.
#
# for a zero-byte request (`N == 0`, or a zero-sized `T`) it returns the expression
# `reinterpret(Ptr{T}, C_NULL)` and emits no IR at all.
#
# throws an `ErrorException` if `T` is not `isbits`, an `ArgumentError` if `N` is negative,
# and an `OverflowError` if the total size in bytes does not fit in an `Int`.
function alloca_value(@nospecialize(T), N::Int)
    isbitstype(T) ||
        error("GPUCompiler.alloca only supports `isbits` element types, got $T")
    N >= 0 || throw(ArgumentError("GPUCompiler.alloca count must be non-negative, got $N"))

    # checked multiply: a wrapped product would request the wrong amount of storage, and a
    # product that wraps to exactly 0 would silently take the null-pointer path below.
    bytes = Base.checked_mul(sizeof(T), N)
    align = Base.datatype_alignment(T)

    # a zero-byte allocation has no storage to point at; hand back a null pointer rather
    # than emitting a degenerate 0-element alloca.
    if bytes == 0
        return :(reinterpret(Ptr{$T}, C_NULL))
    end

    @dispose ctx=Context() begin
        # `i8*` on contexts that still use typed pointers, the opaque `ptr` otherwise.
        T_ptr = if LLVM.supports_typed_pointers(ctx)
            LLVM.PointerType(LLVM.Int8Type())
        else
            LLVM.PointerType()
        end

        # create function
        llvm_f, _ = create_function(T_ptr)
        mod = LLVM.parent(llvm_f)

        # get intrinsic
        intr = alloca_intr(mod, T_ptr)
        intr_ft = function_type(intr)

        # generate IR
        @dispose builder=IRBuilder() begin
            entry = BasicBlock(llvm_f, "entry")
            position!(builder, entry)

            args = Value[ConstantInt(LLVM.Int64Type(), bytes),
                         ConstantInt(LLVM.Int64Type(), align)]
            ptr = call!(builder, intr_ft, intr, args, "alloca")

            ret!(builder, ptr)
        end

        call_function(llvm_f, Ptr{T})
    end
end

# device-facing entry point: `alloca(T, Val(N))` yields a `Ptr{T}` to per-workitem stack
# scratch for `N` elements of `T`. the element type and count are taken from the type
# domain, and the body is produced by `alloca_value(T, N)` when the method is generated.
#
# the storage is uninitialized and only valid within the calling kernel. `T` must be
# `isbits` (an `alloca` of GC-tracked references would be unrooted). meant as a building
# block for higher-level scratch abstractions (e.g. KernelAbstractions' `@private`).
@inline @generated function alloca(::Type{T}, ::Val{N}) where {T,N}
    return alloca_value(T, N)
end

# lowering pass for stack allocation: rewrites every call to the `julia.gpu.alloca`
# intrinsic in `mod` into an `alloca` placed at the top of the entry block of the function
# containing the call, then deletes the intrinsic declaration.
#
# returns `false` (leaving `mod` untouched) when the module does not declare the intrinsic,
# and `true` once all calls have been replaced. `job` is not consulted.
function lower_alloca!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
    intr_name = "julia.gpu.alloca"
    fns = functions(mod)
    if !haskey(fns, intr_name)
        return false
    end
    intr = fns[intr_name]

    # snapshot the use list before touching anything: the loop below erases the calls,
    # which would otherwise mutate the list while it is being iterated.
    pending = collect(uses(intr))

    @dispose builder=IRBuilder() begin
        for u in pending
            call = user(u)
            @assert call isa LLVM.CallInst

            # the two leading operands are the constant byte count and alignment.
            ops = operands(call)
            bytes = convert(Int, ops[1])
            align = convert(Int, ops[2])

            # instruction -> basic block -> function
            f = LLVM.parent(LLVM.parent(call))

            # insert ahead of the first instruction of the entry block, so that the slot
            # is a static alloca (promotable, and allocated once rather than per loop
            # iteration).
            entry_bb = first(blocks(f))
            position!(builder, first(instructions(entry_bb)))

            slot_ty = LLVM.ArrayType(LLVM.Int8Type(), bytes)
            slot = alloca!(builder, slot_ty, "alloca")
            alignment!(slot, align)

            # `alloca!` placed the slot in the datalayout's alloca address space; cast it
            # to the type the intrinsic call returned (generic, AS 0) so that it matches
            # the `Ptr` handed out by `alloca`.
            ptr = pointercast!(builder, slot, value_type(call))

            replace_uses!(call, ptr)
            erase!(call)
        end
    end

    # every call is gone, so the declaration can be dropped too.
    @assert isempty(uses(intr))
    erase!(intr)

    return true
end

# convert kernel state argument from pass-by-value to pass-by-reference
#
# the kernel state argument is always passed by value to avoid codegen issues with byval.
Expand Down
3 changes: 3 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673"
Aqua = "0.8"
LLVM_jll = "15,16,18,20"
ParallelTestRunner = "2"

[sources]
GPUCompiler = { path=".." }
41 changes: 41 additions & 0 deletions test/gcn.jl
Original file line number Diff line number Diff line change
Expand Up @@ -476,5 +476,46 @@ end
GCN.code_native(devnull, mod.kernel, Tuple{Float32,Ptr{Float32}})
end

# exercises `GPUCompiler.alloca` and its lowering through the `GCN` test helpers. each
# `@filecheck` block matches its directives, in order, against the LLVM IR printed by the
# `code_llvm` call on its last line.
@testset "stack allocation intrinsic" begin
# anonymous module holding the functions under test.
mod = @eval module $(gensym())
import ..GPUCompiler

# stores `x` into the first and last element of an 8-element `Float32` scratch buffer and
# returns the sum of the two values loaded back.
function scratch(x)
p = GPUCompiler.alloca(Float32, Val(8))
@inbounds unsafe_store!(p, x, 1)
@inbounds unsafe_store!(p, x, 8)
return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
end

# zero-element scratch yields a (null) pointer without emitting an alloca; the function
# returns whether the result is identical to a null `Ptr{Float32}`.
empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
end

# AMDGPU uses alloca address space 5, so the materialized slot lives in AS 5 and
# `lower_alloca!` emits an `addrspacecast` back to generic (AS 0). the slot is 8 Float32
# elements = 32 bytes with 4-byte alignment. checked on the unoptimized whole-module dump,
# so a surviving `julia.gpu.alloca` call or declaration would be caught.
@test @filecheck begin
@check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
@check "alloca [32 x i8], align 4, addrspace(5)"
@check "addrspacecast"
@check_not "julia.gpu.alloca"
GCN.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
end

# once optimized the slot is promoted away entirely (result is x + x).
# NOTE(review): this block only asserts that the intrinsic is gone; unlike the equivalent
# native test it has no `@check_not "alloca"` -- confirm whether that is intentional.
@test @filecheck begin
@check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
@check_not "julia.gpu.alloca"
GCN.code_llvm(mod.scratch, Tuple{Float32})
end

# a zero-byte allocation lowers to a null pointer rather than a degenerate alloca: neither
# an `alloca` nor the intrinsic may appear after the function's `define` line.
@test @filecheck begin
@check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
@check_not "alloca"
@check_not "julia.gpu.alloca"
GCN.code_llvm(mod.empty_scratch, Tuple{})
end
end

end
end # :AMDGPU in LLVM.backends()
41 changes: 41 additions & 0 deletions test/native.jl
Original file line number Diff line number Diff line change
Expand Up @@ -739,3 +739,44 @@ end
@test !occursin("deferred_codegen", ir)
@test occursin("call void @julia_kernel", ir)
end

# exercises `GPUCompiler.alloca` and its lowering through the `Native` test helpers. each
# `@filecheck` block matches its directives, in order, against the LLVM IR printed by the
# `code_llvm` call on its last line.
@testset "stack allocation intrinsic" begin
# anonymous module holding the functions under test.
mod = @eval module $(gensym())
import ..GPUCompiler

# stores `x` into the first and last element of an 8-element `Float32` scratch buffer and
# returns the sum of the two values loaded back.
function scratch(x)
p = GPUCompiler.alloca(Float32, Val(8))
@inbounds unsafe_store!(p, x, 1)
@inbounds unsafe_store!(p, x, 8)
return @inbounds unsafe_load(p, 1) + unsafe_load(p, 8)
end

# zero-element scratch yields a (null) pointer without emitting an alloca; the function
# returns whether the result is identical to a null `Ptr{Float32}`.
empty_scratch() = GPUCompiler.alloca(Float32, Val(0)) === reinterpret(Ptr{Float32}, C_NULL)
end

# the intrinsic is materialized as an entry-block `alloca [32 x i8]` with 4-byte alignment
# (8 Float32 elements = 32 bytes), and no `julia.gpu.alloca` call/declaration survives
# lowering. checked on the unoptimized whole-module dump.
@test @filecheck begin
@check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
@check "alloca [32 x i8], align 4"
@check_not "julia.gpu.alloca"
Native.code_llvm(mod.scratch, Tuple{Float32}; optimize=false, dump_module=true)
end

# once optimized the slot is promoted away entirely (result is x + x): neither an `alloca`
# nor the intrinsic may appear after the function's `define` line.
@test @filecheck begin
@check_label "define float @{{(julia|j)_scratch_[0-9]+}}"
@check_not "alloca"
@check_not "julia.gpu.alloca"
Native.code_llvm(mod.scratch, Tuple{Float32})
end

# a zero-byte allocation lowers to a null pointer rather than a degenerate alloca.
@test @filecheck begin
@check_label "define {{.*}}@{{(julia|j)_empty_scratch_[0-9]+}}"
@check_not "alloca"
@check_not "julia.gpu.alloca"
Native.code_llvm(mod.empty_scratch, Tuple{})
end
end
Loading