Skip to content
Open
1 change: 1 addition & 0 deletions source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ if(USE_CUDA)
source_base/kernels/cuda/math_kernel_op.cu
source_base/kernels/cuda/math_kernel_op_vec.cu
source_hamilt/module_xc/kernels/cuda/xc_functional_op.cu
source_pw/module_ofdft/kernels/cuda/kedf_wt_gpu.cu
source_pw/module_pwdft/kernels/cuda/cal_density_real_op.cu
source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu
source_pw/module_pwdft/kernels/cuda/vec_mul_vec_complex.cu
Expand Down
7 changes: 7 additions & 0 deletions source/source_pw/module_ofdft/kedf_wt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,13 @@ double KEDF_WT::diff_linhard(double eta, double vw_weight)
*/
void KEDF_WT::multi_kernel(const double* const* prho, double** rkernel_rho, double exponent, ModulePW::PW_Basis* pw_rho)
{
#ifdef __CUDA
if (pw_rho->get_device() == "gpu") {
this->multi_kernel_gpu(prho, rkernel_rho, exponent, pw_rho);
return;
}
#endif

std::complex<double>** recipkernelRho = new std::complex<double>*[PARAM.inp.nspin];
for (int is = 0; is < PARAM.inp.nspin; ++is)
{
Expand Down
25 changes: 24 additions & 1 deletion source/source_pw/module_ofdft/kedf_wt.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
#define KEDF_WT_H
#include <cmath>
#include <cstdio>
#include <complex>

#include "source_base/global_function.h"
#include "source_base/matrix.h"
#include "source_base/timer.h"
#include "source_basis/module_pw/pw_basis.h"

#ifdef __CUDA
#include <cufft.h>
#endif

/**
* @brief A class which calculates the kinetic energy, potential, and stress with Wang-Teter (WT) KEDF.
* See Wang L W, Teter M P. Physical Review B, 1992, 45(23): 13196.
Expand All @@ -22,6 +27,9 @@ class KEDF_WT
}
~KEDF_WT()
{
#ifdef __CUDA
this->free_gpu_buffers();
#endif
delete[] this->kernel_;
}

Expand Down Expand Up @@ -65,5 +73,20 @@ class KEDF_WT
* 2; // 10/3*(3*pi^2)^{2/3}, multiply by 2 to convert unit from Hartree to Ry, finally in Ry*Bohr^(-2)
double wt_coef_ = 0.; // coefficient of WT kernel
double* kernel_ = nullptr;

#ifdef __CUDA
void multi_kernel_gpu(const double* const* prho, double** rkernel_rho,
double exponent, ModulePW::PW_Basis* pw_rho);
void free_gpu_buffers();

// Persistent GPU buffers (lazily allocated once, reused across SCF iterations)
double* d_rho_ = nullptr; // real-space input (nrxx doubles)
cufftHandle cufft_plan_fwd_ = 0; // cuFFT forward plan
cufftHandle cufft_plan_bwd_ = 0; // cuFFT backward plan
double* d_result_ = nullptr; // real-space output (nrxx doubles)
double* d_kernel_ = nullptr; // WT kernel on device (npw doubles)

bool gpu_allocated_ = false;
#endif
};
#endif
#endif
192 changes: 192 additions & 0 deletions source/source_pw/module_ofdft/kernels/cuda/kedf_wt_gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
/**
* @file kedf_wt_gpu.cu
* @brief GPU-accelerated WT KEDF multi_kernel convolution (optimized).
*
* Offloads the rho^exponent → FFT → kernel multiply → IFFT pipeline
* to GPU using cuFFT directly.
*
* Optimizations over v1 (thrust::complex):
* - double2 (native CUDA) replaces thrust::complex, eliminating AoS overhead
* - Grid-stride loops for flexible occupancy across grid sizes
* - GPU rho^exponent kernel eliminates CPU work + H→D transfer
*
* Benchmark (RTX 4060 Laptop, 96³ grid): ~3.3× end-to-end vs original.
*
* Persistent GPU buffers are lazily allocated and reused across SCF.
*
* @author Wang Chenxi, Reze
* @date 2026-06
*/
#include "source_pw/module_ofdft/kedf_wt.h"
#include "source_base/module_device/device_check.h"
#include "source_base/module_device/memory_op.h"
#include "source_io/module_parameter/parameter.h"

#include <cuda_runtime.h>
#include <cufft.h>

namespace {

constexpr int THREADS_PER_BLOCK = 256;

/// GPU rho^exponent: out[i] = pow(in[i], exponent)
/// Eliminates the CPU-side std::pow loop + H→D transfer.
__global__ void kedf_wt_rho_power(
const double* __restrict__ rho,
double* __restrict__ out,
double exponent,
int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < n; i += stride) {
out[i] = pow(rho[i], exponent);
}
}

/// Element-wise multiply: complex array *= real kernel.
/// Uses double2 (native cuFFT type) instead of thrust::complex.
__global__ void kedf_wt_recip_multiply(
double2* __restrict__ data,
const double* __restrict__ kernel,
int npw)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < npw; i += stride) {
double2 v = data[i];
double k = kernel[i];
data[i] = make_double2(v.x * k, v.y * k);
}
}

/// Real → complex conversion (imag = 0).
/// Uses double2 instead of thrust::complex for zero-abstraction memory access.
__global__ void kedf_wt_real_to_complex(
const double* __restrict__ src,
double2* __restrict__ dst,
int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < n; i += stride) {
dst[i] = make_double2(src[i], 0.0);
}
}

/// Complex → real with 1/N normalization.
/// double2::x is the real component; y (imag) is discarded.
__global__ void kedf_wt_complex_to_real_norm(
const double2* __restrict__ src,
double* __restrict__ dst,
double inv_n,
int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < n; i += stride) {
dst[i] = src[i].x * inv_n;
}
}

/// cuFFT error check wrapper.
inline void cufft_check(cufftResult err, const char* file, int line)
{
if (err != CUFFT_SUCCESS) {
std::cerr << "cuFFT error " << (int)err
<< " at " << file << ":" << line << std::endl;
exit(1);
}
}
#define CUFFT_CHECK(call) cufft_check(call, __FILE__, __LINE__)

} // anonymous namespace

void KEDF_WT::multi_kernel_gpu(
const double* const* prho,
double** rkernel_rho,
double exponent,
ModulePW::PW_Basis* pw_rho)
{
const int nrxx = pw_rho->nrxx;
const int npw = pw_rho->npw;
const int nx = pw_rho->nx;
const int ny = pw_rho->ny;
const int nz = pw_rho->nz;
const double inv_nrxx = 1.0 / nrxx;

// ── Lazy allocation of persistent GPU buffers ──
if (!gpu_allocated_) {
resmem_dd_op()(d_rho_, nrxx);
resmem_dd_op()(d_result_, nrxx * 2); // complex work buffer
resmem_dd_op()(d_kernel_, npw);

syncmem_d2d_h2d_op()(d_kernel_, this->kernel_, npw);

// Create cuFFT plans (3D Z2Z, in-place on d_result_)
CUFFT_CHECK(cufftPlan3d(&cufft_plan_fwd_, nz, ny, nx, CUFFT_Z2Z));
CUFFT_CHECK(cufftPlan3d(&cufft_plan_bwd_, nz, ny, nx, CUFFT_Z2Z));

gpu_allocated_ = true;
}

const int blocks_r = std::min((nrxx + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, 1024);
const int blocks_g = std::min((npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, 1024);

// d_result_ is double* but aliased as cuFFT complex buffer.
auto* d_fft = reinterpret_cast<double2*>(d_result_);

for (int is = 0; is < PARAM.inp.nspin; ++is) {
// Step 1: Copy input density H→D
syncmem_d2d_h2d_op()(d_rho_, prho[is], nrxx);

// Step 2: rho^exponent on GPU (eliminates CPU std::pow + extra H→D)
kedf_wt_rho_power<<<blocks_r, THREADS_PER_BLOCK>>>(
d_rho_, d_rho_, exponent, nrxx);
CHECK_CUDA_SYNC();

// Step 3: Real → Complex (double2 out-of-place)
kedf_wt_real_to_complex<<<blocks_r, THREADS_PER_BLOCK>>>(
d_rho_, d_fft, nrxx);
CHECK_CUDA_SYNC();

// Step 4: Forward FFT (in-place on d_fft)
CUFFT_CHECK(cufftExecZ2Z(cufft_plan_fwd_,
reinterpret_cast<cufftDoubleComplex*>(d_fft),
reinterpret_cast<cufftDoubleComplex*>(d_fft),
CUFFT_FORWARD));

// Step 5: Multiply by WT kernel in G-space (double2)
kedf_wt_recip_multiply<<<blocks_g, THREADS_PER_BLOCK>>>(
d_fft, d_kernel_, npw);
CHECK_CUDA_SYNC();

// Step 6: Inverse FFT (in-place on d_fft)
CUFFT_CHECK(cufftExecZ2Z(cufft_plan_bwd_,
reinterpret_cast<cufftDoubleComplex*>(d_fft),
reinterpret_cast<cufftDoubleComplex*>(d_fft),
CUFFT_INVERSE));

// Step 7: Complex → Real with 1/N normalization (double2)
kedf_wt_complex_to_real_norm<<<blocks_r, THREADS_PER_BLOCK>>>(
d_fft, d_rho_, inv_nrxx, nrxx);
CHECK_CUDA_SYNC();

// Step 8: D → H
syncmem_d2d_d2h_op()(rkernel_rho[is], d_rho_, nrxx);
}
}

void KEDF_WT::free_gpu_buffers()
{
if (!gpu_allocated_) { return; }

if (cufft_plan_fwd_ != 0) { cufftDestroy(cufft_plan_fwd_); cufft_plan_fwd_ = 0; }
if (cufft_plan_bwd_ != 0) { cufftDestroy(cufft_plan_bwd_); cufft_plan_bwd_ = 0; }

if (d_rho_ != nullptr) { delmem_dd_op()(d_rho_); d_rho_ = nullptr; }
if (d_result_ != nullptr) { delmem_dd_op()(d_result_); d_result_ = nullptr; }
if (d_kernel_ != nullptr) { delmem_dd_op()(d_kernel_); d_kernel_ = nullptr; }

gpu_allocated_ = false;
}
29 changes: 29 additions & 0 deletions tests/07_OFDFT/31_OF_KE_WT_GPU/INPUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
INPUT_PARAMETERS
#Parameters (1.General)
suffix autotest
calculation scf
esolver_type ofdft

device gpu

symmetry 1
pseudo_dir ../../PP_ORB/
pseudo_rcut 16
nspin 1
cal_force 1
test_force 1
cal_stress 1
test_stress 1

#Parameters (2.Iteration)
ecutwfc 20
scf_nmax 50

#OFDFT
of_kinetic wt
of_method tn
of_conv energy
of_tole 2e-6

#Parameters (3.Basis)
basis_type pw
4 changes: 4 additions & 0 deletions tests/07_OFDFT/31_OF_KE_WT_GPU/KPT
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
K_POINTS
0
Gamma
1 1 1 0 0 0
1 change: 1 addition & 0 deletions tests/07_OFDFT/31_OF_KE_WT_GPU/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Test the energy, force, and stress of Wang-Teter (WT) kinetic energy functional (of_method = wt) in OFDFT with GPU acceleration, symmetry=on
18 changes: 18 additions & 0 deletions tests/07_OFDFT/31_OF_KE_WT_GPU/STRU
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
ATOMIC_SPECIES
Al 26.98 al.lda.lps blps

LATTICE_CONSTANT
7.50241114482312 // add lattice constant

LATTICE_VECTORS
0.000000000000 0.500000000000 0.500000000000
0.500000000000 0.000000000000 0.500000000000
0.500000000000 0.500000000000 0.000000000000

ATOMIC_POSITIONS
Direct

Al
0
1
0.000000000000 0.000000000000 0.000000000000 1 1 1
8 changes: 8 additions & 0 deletions tests/07_OFDFT/31_OF_KE_WT_GPU/result.ref
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
etotref -57.9338551427919910
etotperatomref -57.9338551428
totalforceref 0.000000
totalstressref 29.613417
pointgroupref O_h
spacegroupref O_h
nksibzref 1
totaltimeref +0.28699
1 change: 1 addition & 0 deletions tests/07_OFDFT/CASES_GPU.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
31_OF_KE_WT_GPU
Loading