Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce clcall for ccall-like kernel invocation. #228

Merged
merged 1 commit into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ Breaking changes:
objects, have been replaced by `getproperty` overloading on the objects themselves
(e.g., `cl.info(dev, :name)` and `dev[:name]` are now simply `dev.name`).
- The blocking `cl.launch` has been replaced by a nonblocking `cl.call`, while also removing
the `getindex`-overloading shorthand.
the `getindex`-overloading shorthand. However, it's recommended to use the newly-added
`cl.clcall` function, which takes an additional tuple type argument and performs automatic
conversions of arguments to those types. This makes it possible to pass a `CLArray` to an
OpenCL C function expecting Buffer-backed pointers, for example.
- Argument conversion has been removed; the user should make sure Julia arguments passed to
kernels match the OpenCL argument types (i.e., no empty types, 4-element tuples for
3-element `float3` arguments).
Expand Down
55 changes: 55 additions & 0 deletions lib/kernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,61 @@ function call(k::Kernel, args...; global_size=(1,), local_size=nothing,
enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on)
end

# convert the argument values to match the kernel's signature (specified by the user)
# (this mimics `lower-ccall` in julia-syntax.scm)
#
# For each argument `args[i]` and its matching type `tt.parameters[i]`, the generated code
# calls `Base.cconvert` followed by `Base.unsafe_convert`. The `cconvert` results are kept
# alive with `GC.@preserve` while `f` is called with the `unsafe_convert` results; the
# value of that call is returned.
#
# Throws an `ArgumentError` when the number of arguments differs from the number of types
# in `tt`, rather than silently ignoring surplus types or failing with a `BoundsError`
# raised from inside the generator.
@inline @generated function convert_arguments(f::Function, ::Type{tt}, args...) where {tt}
    types = tt.parameters
    nargs = length(args)

    # the signature and the argument list have to line up one-to-one
    if length(types) != nargs
        msg = string("convert_arguments: signature has ", length(types),
                     " type(s), but ", nargs, " argument(s) were passed")
        return :(throw(ArgumentError($msg)))
    end

    ex = quote end

    # NOTE: plain loops only; generated function bodies may not contain comprehensions
    converted_args = Vector{Symbol}(undef, nargs)
    arg_ptrs = Vector{Symbol}(undef, nargs)
    for i in 1:nargs
        converted_args[i] = gensym()
        arg_ptrs[i] = gensym()
        push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i])))
        push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i]))))
    end

    # root the `cconvert` results for the duration of the call
    append!(ex.args, (quote
        GC.@preserve $(converted_args...) begin
            f($(arg_ptrs...))
        end
    end).args)

    return ex
end

# Convenience method: accept the argument types as a tuple of types, turn it into a tuple
# type with `_to_tuple_type`, and forward everything (keyword arguments included) to the
# `clcall` method that takes the resulting tuple type.
function clcall(f::F, types::Tuple, args::Vararg{Any,N}; kwargs...) where {N,F}
    tuple_type = _to_tuple_type(types)
    return clcall(f, tuple_type, args...; kwargs...)
end

# Invoke `call` on kernel `k` after running `args` through `convert_arguments` with the
# tuple type `types`. Keyword arguments are passed on to `call` unchanged.
function clcall(k::Kernel, types::Type{T}, args::Vararg{Any,N}; kwargs...) where {T,N}
    # `convert_arguments` calls this back with the converted values
    function launch(converted::Vararg{Any,N})
        return call(k, converted...; kwargs...)
    end
    return convert_arguments(launch, types, args...)
end

# From `julia/base/reflection.jl`, adjusted to add specialization on `t`.
#
# Normalize `t` into a tuple type: a `Tuple`, `AbstractArray` or `Core.SimpleVector` is
# spliced into `Tuple{...}`, anything else is used as-is. The result is returned after
# checking that it is a tuple type whose parameters are all types (or type variables);
# otherwise an error is raised.
function _to_tuple_type(t)
    # `SimpleVector` is not exported from `Core`, so it has to be qualified here
    if isa(t, Tuple) || isa(t, AbstractArray) || isa(t, Core.SimpleVector)
        t = Tuple{t...}
    end
    if isa(t, Type) && t <: Tuple
        for p in (Base.unwrap_unionall(t)::DataType).parameters
            # look through `Vararg` to the element type it wraps
            if isa(p, Core.TypeofVararg)
                p = Base.unwrapva(p)
            end
            if !(isa(p, Type) || isa(p, TypeVar))
                error("argument tuple type must contain only types")
            end
        end
    else
        error("expected tuple type")
    end
    return t
end

function enqueue_task(k::Kernel; wait_for=nothing)
n_evts = 0
evt_ids = C_NULL
Expand Down
4 changes: 4 additions & 0 deletions lib/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ abstract type AbstractMemory <: CLObject end
# ...
# end

# When a memory object is handed to an OpenCL API expecting a `cl_mem`, pass its `id`.
function Base.unsafe_convert(::Type{cl_mem}, mem::AbstractMemory)
    return mem.id
end

# When a memory object is passed as a kernel argument typed as a pointer, hand the object
# itself through unchanged; it's handled by `cl.set_arg!`.
function Base.unsafe_convert(::Type{<:Ptr}, mem::AbstractMemory)
    return mem
end

# `pointer` of a memory object is its `id` field.
function Base.pointer(mem::AbstractMemory)
    return mem.id
end

# `sizeof` of a memory object is its `size` field.
function Base.sizeof(mem::AbstractMemory)
    return mem.size
end
Expand Down
6 changes: 5 additions & 1 deletion src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ function Base.reshape(A::CLArray{T}, dims::NTuple{N,Int}) where {T,N}
end


## interop with other arrays
## conversions

function CLArray(hostarray::AbstractArray{T,N}; kwargs...) where {T, N}
buf = cl.Buffer(hostarray; kwargs...)
Expand All @@ -89,6 +89,10 @@ function Base.Array(A::CLArray{T,N}) where {T, N}
return hA
end

# A `CLArray{T}` passed where a `Ptr{T}` is requested converts to the result of `buffer(A)`.
Base.cconvert(::Type{Ptr{T}}, A::CLArray{T}) where {T} = buffer(A)


## utilities

Expand Down
7 changes: 4 additions & 3 deletions test/behaviour.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
prg = cl.Program(source=hello_world_kernel) |> cl.build!
kern = cl.Kernel(prg, "hello")

cl.call(kern, buffer(out_arr); global_size=str_len)
cl.clcall(kern, Tuple{Ptr{Cchar}}, out_arr; global_size=str_len)
h = Array(out_arr)

@test hello_world_str == GC.@preserve h unsafe_string(pointer(h))
Expand Down Expand Up @@ -212,7 +212,8 @@ end
R_arr = CLArray{Float32}(undef, 10; device=:w)

global_size = size(X)
cl.call(part3, buffer(X_arr), buffer(Y_arr), buffer(R_arr), buffer(P_arr); global_size, local_size=nothing)
cl.clcall(part3, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ptr{Params}},
X_arr, Y_arr, R_arr, P_arr; global_size)

r = Array(R_arr)
@test all(x -> x == 13.5, r)
Expand Down Expand Up @@ -250,7 +251,7 @@ end

P = MutableParams(0.5, 10.0)
P_arr = CLArray{Float32}(undef, 2)
cl.call(part3, buffer(P_arr), P)
cl.clcall(part3, Tuple{Ptr{Float32}, MutableParams}, P_arr, P)

r = Array(P_arr)

Expand Down
13 changes: 8 additions & 5 deletions test/kernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,17 @@
k = cl.Kernel(p, "test")

# dimensions must be the same size
@test_throws ArgumentError cl.call(k, buffer(d_arr); global_size=(1,), local_size=(1,1))
@test_throws ArgumentError cl.call(k, buffer(d_arr); global_size=(1,1), local_size=(1,))
@test_throws ArgumentError cl.clcall(k, Tuple{Ptr{Float32}}, d_arr;
global_size=(1,), local_size=(1,1))
@test_throws ArgumentError cl.clcall(k, Tuple{Ptr{Float32}}, d_arr;
global_size=(1,1), local_size=(1,))

# dimensions are bounded
max_work_dim = cl.device().max_work_item_dims
bad = tuple([1 for _ in 1:(max_work_dim + 1)])

# calls are asynchronous, but cl.read blocks
cl.call(k, buffer(d_arr))
cl.clcall(k, Tuple{Ptr{Float32}}, d_arr)
@test Array(d_arr) == [2f0]

# enqueue task is an alias for calling
Expand All @@ -130,7 +132,7 @@
structkernel = cl.Kernel(prg, "structest")
out = CLArray{Float32}(undef, 2)
bstruct = (1, Int32(4))
cl.call(structkernel, buffer(out), bstruct)
cl.clcall(structkernel, Tuple{Ptr{Float32}, Tuple{Clong, Cint}}, out, bstruct)
@test Array(out) == [1f0, 4f0]
end

Expand All @@ -153,7 +155,8 @@
# (only on some platforms)
vec3_a = (1f0, 2f0, 3f0, 0f0)
vec3_b = (4f0, 5f0, 6f0, 0f0)
cl.call(vec3kernel, buffer(out), vec3_a, vec3_b)
cl.clcall(vec3kernel, Tuple{Ptr{Float32}, NTuple{4,Float32}, NTuple{4,Float32}},
out, vec3_a, vec3_b)
@test Array(out) == [1f0, 2f0, 3f0, 4f0, 5f0, 6f0]
end
end