Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP]Generic Optimizations #79

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ option(PORTFFT_CLANG_TIDY_AUTOFIX "Attempt to fix defects found by clang-tidy" O
option(PORTFFT_LOG_DUMPS "Whether to enable logging of data dumps" OFF)
option(PORTFFT_LOG_TRANSFERS "Whether to enable logging of memory transfers" OFF)
option(PORTFFT_LOG_TRACE "Whether to enable tracing of function calls" OFF)
option(PORTFFT_USE_FAST_TRIG_APPROX "Whether or not to use fast approx for trignometric functions" OFF)
set(PORTFFT_REGISTERS_PER_WI 128 CACHE STRING "How many 32b registers can be allocated per work item on the target device")
set(PORTFFT_SUBGROUP_SIZES 32 CACHE STRING "Comma separated list of subgroup sizes to compile for. The first size supported by the device will be used.")
set(PORTFFT_VEC_LOAD_BYTES 16 CACHE STRING "Number of consecutive bytes each work item should load at once.")
Expand Down Expand Up @@ -95,6 +96,9 @@ endif()
if(${PORTFFT_LOG})
target_compile_definitions(portfft INTERFACE PORTFFT_LOG)
endif()
if(${PORTFFT_USE_FAST_TRIG_APPROX})
target_compile_definitions(portfft INTERFACE PORTFFT_USE_FAST_TRIG_APPROX)
endif()

target_compile_options(portfft INTERFACE -fgpu-inline-threshold=1000000)
target_link_options(portfft INTERFACE -fsycl-device-code-split=per_kernel)
Expand Down
50 changes: 37 additions & 13 deletions src/common/subgroup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,26 @@ __attribute__((always_inline)) inline void cross_sg_naive_dft(T& real, T& imag,
T res_imag = 0;

unrolled_loop<0, N, 1>([&](Idx idx_in) __attribute__((always_inline)) {
const T multi_re = twiddle<T>::Re[N][idx_in * idx_out % N];
const T multi_im = [&]() __attribute__((always_inline)) {
if constexpr (Dir == direction::FORWARD) {
#ifdef PORTFFT_USE_FAST_TRIG_APPROX
T theta = (static_cast<T>(-2 * M_PI)) * static_cast<T>(idx_in * idx_out % N) / static_cast<T>(N);
#endif
T multi_re = [&]() {
#ifdef PORTFFT_USE_FAST_TRIG_APPROX
return static_cast<T>(sycl::cos(theta));
#else
return twiddle<T>::Re[N][idx_in * idx_out % N];
#endif
}();
T multi_im = [&]() {
#ifdef PORTFFT_USE_FAST_TRIG_APPROX
return static_cast<T>(sycl::sin(theta));
#else
return twiddle<T>::Im[N][idx_in * idx_out % N];
}
return -twiddle<T>::Im[N][idx_in * idx_out % N];
#endif
}();
if constexpr (Dir == direction::BACKWARD) {
multi_im = -multi_im;
}
();
std::size_t source_wi_id = static_cast<std::size_t>(fft_start + idx_in * Stride);

T cur_real = sycl::select_from_group(sg, real, source_wi_id);
Expand Down Expand Up @@ -181,14 +193,26 @@ __attribute__((always_inline)) inline void cross_sg_cooley_tukey_dft(T& real, T&
// transpose
cross_sg_transpose<N, M, Stride>(real, imag, sg);
// twiddle
const T multi_re = twiddle<T>::Re[N * M][k * n];
const T multi_im = [&]() __attribute__((always_inline)) {
if constexpr (Dir == direction::FORWARD) {
return twiddle<T>::Im[N * M][k * n];
}
return -twiddle<T>::Im[N * M][k * n];
#ifdef PORTFFT_USE_FAST_TRIG_APPROX
T theta = (static_cast<T>(-2 * M_PI) * static_cast<T>(k * n)) / static_cast<T>(N * M);
#endif
T multi_re = [&]() {
#ifdef PORTFFT_USE_FAST_TRIG_APPROX
return static_cast<T>(sycl::cos(theta));
#else
return twiddle<T>::Re[N * M][k * n];
#endif
}();
T multi_im = [&]() {
#ifdef PORTFFT_USE_FAST_TRIG_APPROX
return static_cast<T>(sycl::sin(theta));
#else
return twiddle<T>::Im[N * M][k * n];
#endif
}();
if constexpr (Dir == direction::BACKWARD) {
multi_im = -multi_im;
}
();
detail::multiply_complex(real, imag, multi_re, multi_im, real, imag);
// factor M
cross_sg_dft<Dir, M, N * Stride>(real, imag, sg);
Expand Down
5 changes: 2 additions & 3 deletions src/descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,8 @@ class committed_descriptor {
std::size_t length;
Idx used_sg_size;

dimension_struct(std::vector<kernel_data_struct> kernels, detail::level level, std::size_t length, Idx used_sg_size) : kernels(kernels), level(level),
length(length),
used_sg_size(used_sg_size) {}
dimension_struct(std::vector<kernel_data_struct> kernels, detail::level level, std::size_t length, Idx used_sg_size)
: kernels(kernels), level(level), length(length), used_sg_size(used_sg_size) {}
};

std::vector<dimension_struct> dimensions;
Expand Down