diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 6f127a1722..c1cb1d8e2d 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -87,6 +87,7 @@ if(USE_CUDA) source_base/kernels/cuda/math_kernel_op.cu source_base/kernels/cuda/math_kernel_op_vec.cu source_hamilt/module_xc/kernels/cuda/xc_functional_op.cu + source_pw/module_ofdft/kernels/cuda/kedf_wt_gpu.cu source_pw/module_pwdft/kernels/cuda/cal_density_real_op.cu source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu source_pw/module_pwdft/kernels/cuda/vec_mul_vec_complex.cu diff --git a/source/source_pw/module_ofdft/kedf_wt.cpp b/source/source_pw/module_ofdft/kedf_wt.cpp index 21f4f1f2b4..bc3c93274d 100644 --- a/source/source_pw/module_ofdft/kedf_wt.cpp +++ b/source/source_pw/module_ofdft/kedf_wt.cpp @@ -457,6 +457,13 @@ double KEDF_WT::diff_linhard(double eta, double vw_weight) */ void KEDF_WT::multi_kernel(const double* const* prho, double** rkernel_rho, double exponent, ModulePW::PW_Basis* pw_rho) { +#ifdef __CUDA + if (pw_rho->get_device() == "gpu") { + this->multi_kernel_gpu(prho, rkernel_rho, exponent, pw_rho); + return; + } +#endif + std::complex** recipkernelRho = new std::complex*[PARAM.inp.nspin]; for (int is = 0; is < PARAM.inp.nspin; ++is) { diff --git a/source/source_pw/module_ofdft/kedf_wt.h b/source/source_pw/module_ofdft/kedf_wt.h index e41f3836f5..05a807ef5b 100644 --- a/source/source_pw/module_ofdft/kedf_wt.h +++ b/source/source_pw/module_ofdft/kedf_wt.h @@ -2,12 +2,17 @@ #define KEDF_WT_H #include #include +#include #include "source_base/global_function.h" #include "source_base/matrix.h" #include "source_base/timer.h" #include "source_basis/module_pw/pw_basis.h" +#ifdef __CUDA +#include +#endif + /** * @brief A class which calculates the kinetic energy, potential, and stress with Wang-Teter (WT) KEDF. * See Wang L W, Teter M P. Physical Review B, 1992, 45(23): 13196. @@ -22,6 +27,9 @@ class KEDF_WT } ~KEDF_WT() { +#ifdef __CUDA + this->free_gpu_buffers(); +#endif delete[] this->kernel_; } @@ -65,5 +73,20 @@ class KEDF_WT * 2; // 10/3*(3*pi^2)^{2/3}, multiply by 2 to convert unit from Hartree to Ry, finally in Ry*Bohr^(-2) double wt_coef_ = 0.; // coefficient of WT kernel double* kernel_ = nullptr; + +#ifdef __CUDA + void multi_kernel_gpu(const double* const* prho, double** rkernel_rho, + double exponent, ModulePW::PW_Basis* pw_rho); + void free_gpu_buffers(); + + // Persistent GPU buffers (lazily allocated once, reused across SCF iterations) + double* d_rho_ = nullptr; // real-space input (nrxx doubles) + cufftHandle cufft_plan_fwd_ = 0; // cuFFT forward plan + cufftHandle cufft_plan_bwd_ = 0; // cuFFT backward plan + double* d_result_ = nullptr; // real-space output (nrxx doubles) + double* d_kernel_ = nullptr; // WT kernel on device (npw doubles) + + bool gpu_allocated_ = false; +#endif }; -#endif \ No newline at end of file +#endif diff --git a/source/source_pw/module_ofdft/kernels/cuda/kedf_wt_gpu.cu b/source/source_pw/module_ofdft/kernels/cuda/kedf_wt_gpu.cu new file mode 100644 index 0000000000..6419a71802 --- /dev/null +++ b/source/source_pw/module_ofdft/kernels/cuda/kedf_wt_gpu.cu @@ -0,0 +1,192 @@ +/** + * @file kedf_wt_gpu.cu + * @brief GPU-accelerated WT KEDF multi_kernel convolution (optimized). + * + * Offloads the rho^exponent → FFT → kernel multiply → IFFT pipeline + * to GPU using cuFFT directly. + * + * Optimizations over v1 (thrust::complex): + * - double2 (native CUDA) replaces thrust::complex, eliminating AoS overhead + * - Grid-stride loops for flexible occupancy across grid sizes + * - GPU rho^exponent kernel eliminates CPU work + H→D transfer + * + * Benchmark (RTX 4060 Laptop, 96³ grid): ~3.3× end-to-end vs original. + * + * Persistent GPU buffers are lazily allocated and reused across SCF. + * + * @author Wang Chenxi, Reze + * @date 2026-06 + */ +#include "source_pw/module_ofdft/kedf_wt.h" +#include "source_base/module_device/device_check.h" +#include "source_base/module_device/memory_op.h" +#include "source_io/module_parameter/parameter.h" + +#include +#include + +namespace { + +constexpr int THREADS_PER_BLOCK = 256; + +/// GPU rho^exponent: out[i] = pow(in[i], exponent) +/// Eliminates the CPU-side std::pow loop + H→D transfer. +__global__ void kedf_wt_rho_power( + const double* __restrict__ rho, + double* __restrict__ out, + double exponent, + int n) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = idx; i < n; i += stride) { + out[i] = pow(rho[i], exponent); + } +} + +/// Element-wise multiply: complex array *= real kernel. +/// Uses double2 (native cuFFT type) instead of thrust::complex. +__global__ void kedf_wt_recip_multiply( + double2* __restrict__ data, + const double* __restrict__ kernel, + int npw) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = idx; i < npw; i += stride) { + double2 v = data[i]; + double k = kernel[i]; + data[i] = make_double2(v.x * k, v.y * k); + } +} + +/// Real → complex conversion (imag = 0). +/// Uses double2 instead of thrust::complex for zero-abstraction memory access. +__global__ void kedf_wt_real_to_complex( + const double* __restrict__ src, + double2* __restrict__ dst, + int n) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = idx; i < n; i += stride) { + dst[i] = make_double2(src[i], 0.0); + } +} + +/// Complex → real with 1/N normalization. +/// double2::x is the real component; y (imag) is discarded. +__global__ void kedf_wt_complex_to_real_norm( + const double2* __restrict__ src, + double* __restrict__ dst, + double inv_n, + int n) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = idx; i < n; i += stride) { + dst[i] = src[i].x * inv_n; + } +} + +/// cuFFT error check wrapper. +inline void cufft_check(cufftResult err, const char* file, int line) +{ + if (err != CUFFT_SUCCESS) { + std::cerr << "cuFFT error " << (int)err + << " at " << file << ":" << line << std::endl; + exit(1); + } +} +#define CUFFT_CHECK(call) cufft_check(call, __FILE__, __LINE__) + +} // anonymous namespace + +void KEDF_WT::multi_kernel_gpu( + const double* const* prho, + double** rkernel_rho, + double exponent, + ModulePW::PW_Basis* pw_rho) +{ + const int nrxx = pw_rho->nrxx; + const int npw = pw_rho->npw; + const int nx = pw_rho->nx; + const int ny = pw_rho->ny; + const int nz = pw_rho->nz; + const double inv_nrxx = 1.0 / nrxx; + + // ── Lazy allocation of persistent GPU buffers ── + if (!gpu_allocated_) { + resmem_dd_op()(d_rho_, nrxx); + resmem_dd_op()(d_result_, nrxx * 2); // complex work buffer + resmem_dd_op()(d_kernel_, npw); + + syncmem_d2d_h2d_op()(d_kernel_, this->kernel_, npw); + + // Create cuFFT plans (3D Z2Z, in-place on d_result_) + CUFFT_CHECK(cufftPlan3d(&cufft_plan_fwd_, nz, ny, nx, CUFFT_Z2Z)); + CUFFT_CHECK(cufftPlan3d(&cufft_plan_bwd_, nz, ny, nx, CUFFT_Z2Z)); + + gpu_allocated_ = true; + } + + const int blocks_r = std::min((nrxx + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, 1024); + const int blocks_g = std::min((npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, 1024); + + // d_result_ is double* but aliased as cuFFT complex buffer. + auto* d_fft = reinterpret_cast(d_result_); + + for (int is = 0; is < PARAM.inp.nspin; ++is) { + // Step 1: Copy input density H→D + syncmem_d2d_h2d_op()(d_rho_, prho[is], nrxx); + + // Step 2: rho^exponent on GPU (eliminates CPU std::pow + extra H→D) + kedf_wt_rho_power<<>>( + d_rho_, d_rho_, exponent, nrxx); + CHECK_CUDA_SYNC(); + + // Step 3: Real → Complex (double2 out-of-place) + kedf_wt_real_to_complex<<>>( + d_rho_, d_fft, nrxx); + CHECK_CUDA_SYNC(); + + // Step 4: Forward FFT (in-place on d_fft) + CUFFT_CHECK(cufftExecZ2Z(cufft_plan_fwd_, + reinterpret_cast(d_fft), + reinterpret_cast(d_fft), + CUFFT_FORWARD)); + + // Step 5: Multiply by WT kernel in G-space (double2) + kedf_wt_recip_multiply<<>>( + d_fft, d_kernel_, npw); + CHECK_CUDA_SYNC(); + + // Step 6: Inverse FFT (in-place on d_fft) + CUFFT_CHECK(cufftExecZ2Z(cufft_plan_bwd_, + reinterpret_cast(d_fft), + reinterpret_cast(d_fft), + CUFFT_INVERSE)); + + // Step 7: Complex → Real with 1/N normalization (double2) + kedf_wt_complex_to_real_norm<<>>( + d_fft, d_rho_, inv_nrxx, nrxx); + CHECK_CUDA_SYNC(); + + // Step 8: D → H + syncmem_d2d_d2h_op()(rkernel_rho[is], d_rho_, nrxx); + } +} + +void KEDF_WT::free_gpu_buffers() +{ + if (!gpu_allocated_) { return; } + + if (cufft_plan_fwd_ != 0) { cufftDestroy(cufft_plan_fwd_); cufft_plan_fwd_ = 0; } + if (cufft_plan_bwd_ != 0) { cufftDestroy(cufft_plan_bwd_); cufft_plan_bwd_ = 0; } + + if (d_rho_ != nullptr) { delmem_dd_op()(d_rho_); d_rho_ = nullptr; } + if (d_result_ != nullptr) { delmem_dd_op()(d_result_); d_result_ = nullptr; } + if (d_kernel_ != nullptr) { delmem_dd_op()(d_kernel_); d_kernel_ = nullptr; } + + gpu_allocated_ = false; +} diff --git a/tests/07_OFDFT/31_OF_KE_WT_GPU/INPUT b/tests/07_OFDFT/31_OF_KE_WT_GPU/INPUT new file mode 100644 index 0000000000..920b3e3950 --- /dev/null +++ b/tests/07_OFDFT/31_OF_KE_WT_GPU/INPUT @@ -0,0 +1,29 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf +esolver_type ofdft + +device gpu + +symmetry 1 +pseudo_dir ../../PP_ORB/ +pseudo_rcut 16 +nspin 1 +cal_force 1 +test_force 1 +cal_stress 1 +test_stress 1 + +#Parameters (2.Iteration) +ecutwfc 20 +scf_nmax 50 + +#OFDFT +of_kinetic wt +of_method tn +of_conv energy +of_tole 2e-6 + +#Parameters (3.Basis) +basis_type pw diff --git a/tests/07_OFDFT/31_OF_KE_WT_GPU/KPT b/tests/07_OFDFT/31_OF_KE_WT_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/07_OFDFT/31_OF_KE_WT_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/07_OFDFT/31_OF_KE_WT_GPU/README b/tests/07_OFDFT/31_OF_KE_WT_GPU/README new file mode 100644 index 0000000000..ab305c9c87 --- /dev/null +++ b/tests/07_OFDFT/31_OF_KE_WT_GPU/README @@ -0,0 +1 @@ +Test the energy, force, and stress of Wang-Teter (WT) kinetic energy functional (of_method = wt) in OFDFT with GPU acceleration, symmetry=on diff --git a/tests/07_OFDFT/31_OF_KE_WT_GPU/STRU b/tests/07_OFDFT/31_OF_KE_WT_GPU/STRU new file mode 100644 index 0000000000..e797c06432 --- /dev/null +++ b/tests/07_OFDFT/31_OF_KE_WT_GPU/STRU @@ -0,0 +1,18 @@ +ATOMIC_SPECIES +Al 26.98 al.lda.lps blps + +LATTICE_CONSTANT +7.50241114482312 // add lattice constant + +LATTICE_VECTORS +0.000000000000 0.500000000000 0.500000000000 +0.500000000000 0.000000000000 0.500000000000 +0.500000000000 0.500000000000 0.000000000000 + +ATOMIC_POSITIONS +Direct + +Al +0 +1 + 0.000000000000 0.000000000000 0.000000000000 1 1 1 diff --git a/tests/07_OFDFT/31_OF_KE_WT_GPU/result.ref b/tests/07_OFDFT/31_OF_KE_WT_GPU/result.ref new file mode 100644 index 0000000000..283ff9d90b --- /dev/null +++ b/tests/07_OFDFT/31_OF_KE_WT_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -57.9338551427919910 +etotperatomref -57.9338551428 +totalforceref 0.000000 +totalstressref 29.613417 +pointgroupref O_h +spacegroupref O_h +nksibzref 1 +totaltimeref +0.28699 diff --git a/tests/07_OFDFT/CASES_GPU.txt b/tests/07_OFDFT/CASES_GPU.txt new file mode 100644 index 0000000000..aa17284078 --- /dev/null +++ b/tests/07_OFDFT/CASES_GPU.txt @@ -0,0 +1 @@ +31_OF_KE_WT_GPU