# GPU programs for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrödinger and sine Gordon equations

This section includes the programs taken from a conference paper by Cloutier, Muite and Rigge[1]. The main purpose is to give example programs which show how to use graphics processing units (GPUs) to solve partial differential equations using Fourier methods. For further background on GPUs and programming models for GPUs see Cloutier, Muite and Rigge[2]. It should be noted that the algorithms used for the sine Gordon equation are very similar to those for the Klein Gordon equation discussed elsewhere in this tutorial. For consistency with the rest of the tutorial, only programs using CUDA Fortran and OpenACC extensions to Fortran are included although Cloutier, Muite and Rigge[3] also has CUDA C programs. GPUs enable acceleration of Fourier pseudospectral codes by factors of 10 compared to OpenMP parallelizations on a single 8 core node.

## 2D Navier Stokes Equations

These programs use the Crank-Nicolson method.

( A)

A CUDA Fortran program to solve the 2D Navier-Stokes equations Code download
!--------------------------------------------------------------------
!
! PURPOSE
!
! This program numerically solves the 2D incompressible Navier-Stokes
! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and
! Crank-Nicolson timestepping. The numerical solution is compared to
! the exact Taylor-Green Vortex Solution.
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! Periodic free-slip boundary conditions and Initial conditions:
! u(x,y,0)=sin(2*pi*x)cos(2*pi*y)
! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y)
! Analytical Solution (subscript denote derivatives):
! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re)
! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! omega=v_x-u_y
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! nplots = number of plots produced
! plotgap = number of timesteps inbetween plots
! Re = dimensionless Renold's number
! ReInv = 1/Re for optimization
! dt = timestep size
! dtInv = 1/dt for optimization
! tol = determines when convergences is reached
! numthreads = number of CPUs used in calculation
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! time = times at which data is saved
! chg = error at each iteration
! .. Arrays (gpu) ..
! omeg_d = vorticity in real space
! omeghat_d = 2D Fourier transform of vorticity
! at next iterate
! omegoldhat_d = 2D Fourier transform of vorticity at previous
! iterate
! nloldhat_d = nonlinear term in Fourier space
! at previous iterate
! psihat_d = 2D Fourier transform of streamfunction
! at next iteration
! temp1_d/temp2_d/temp3_d = reusable complex/real space used for
! calculations. This reduces number of
! arrays stored.
! .. Vectors (gpu) ..
! kx_d = fourier frequencies in x direction
! ky_d = fourier frequencies in y direction
! x_d = x locations
! y_d = y locations
! name_config = array to store filename for data to be saved
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! This program has not been fully optimized.
!--------------------------------------------------------------------
module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision

module cufft

integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy

interface cufftExecD2Z
subroutine cufftExecD2Z(plan, idata, odata) &
& bind(C,name='cufftExecD2Z')
use iso_c_binding
use precision
integer(c_int), value :: plan
real(fp_kind), device :: idata(1:nx,1:ny)
complex(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecD2Z
end interface cufftExecD2Z

interface cufftExecZ2D
subroutine cufftExecZ2D(plan, idata, odata) &
& bind(C,name='cufftExecZ2D')
use iso_c_binding
use precision
integer(c_int),value:: plan
complex(fp_kind),device:: idata(1:nx,1:ny)
real(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecZ2D
end interface cufftExecZ2D

end module cufft

PROGRAM main
use precision
use cufft
! declare variables
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=4096
INTEGER(kind=4), PARAMETER :: Ny=4096
INTEGER(kind=8) :: temp=10000000
REAL(fp_kind), PARAMETER :: dt=0.000125d0 !dt=0.000002d0
REAL(fp_kind), PARAMETER :: dtInv=1.0d0/REAL(dt,kind(0d0))
REAL(fp_kind), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Re=1.0d0
REAL(fp_kind), PARAMETER :: ReInv=1.0d0/REAL(Re,kind(0d0))
REAL(fp_kind), PARAMETER :: tol=0.1d0**10
REAL(fp_kind) :: scalemodes,chg
INTEGER(kind=4), PARAMETER :: nplots=1,plotgap=20
REAL(fp_kind),DIMENSION(:), ALLOCATABLE :: x,y
REAL(fp_kind),DIMENSION(:,:), ALLOCATABLE :: omeg,omegexact
INTEGER(kind=4) :: i,j,n,t, AllocateStatus
INTEGER(kind=4) :: planz2d,pland2z, kersize
!variables used for saving data and timing
INTEGER(kind=4) :: start, finish, count_rate,count, iol
CHARACTER*100 :: name_config
! Declare variables for GPU
REAL(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omeg_d,nl_d, temp2_d,&
temp3_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omegoldhat_d, nloldhat_d,&
omeghat_d, nlhat_d, psihat_d,&
temp1_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
REAL(kind=8),DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d

kersize=min(Nx,256)
PRINT *,'Program starting'
PRINT *,'Grid:',Nx,'X',Ny
PRINT *,'dt:',dt
ALLOCATE(x(1:Nx),y(1:Ny),omeg(1:Nx,1:Ny),omegexact(1:Nx,1:Ny),&
stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Allocated CPU arrays'
ALLOCATE(kx_d(1:Nx/2+1),ky_d(1:Ny),x_d(1:Nx),y_d(1:Ny),omeg_d(1:Nx,1:Ny),&
omegoldhat_d(1:Nx/2+1,1:Ny),nloldhat_d(1:Nx/2+1,1:Ny),&
omeghat_d(1:Nx/2+1,1:Ny),nl_d(1:Nx,1:Ny),&
nlhat_d(1:Nx/2+1,1:Ny),psihat_d(1:Nx/2+1,1:Ny),temp1_d(1:Nx/2+1,1:Ny),&
temp2_d(1:Nx,1:Ny),temp3_d(1:Nx,1:Ny),stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Allocated GPU arrays'
CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z)
CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D)
PRINT *,'Setup FFTs'

! setup fourier frequencies
!$cuf kernel do <<< *,* >>> DO i=1,Nx/2+1 kx_d(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind=fp_kind) END DO kx_d(1+Nx/2)=0.0d0 !$cuf kernel do <<< *,* >>>
DO i=1,Nx
x_d(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind)
END DO
!$cuf kernel do <<< *,* >>> DO j=1,Ny/2+1 ky_d(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind=fp_kind) END DO ky_d(1+Ny/2)=0.0d0 !$cuf kernel do <<< *,* >>>
DO j = 1,Ny/2 -1
ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2)
END DO
!$cuf kernel do <<< *, * >>> DO j=1,Ny y_d(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind) END DO scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind) PRINT *,'Setup grid and fourier frequencies' !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
omeg_d(i,j)=4.0d0*pi*sin(2.0d0*pi*x_d(i))*sin(2.0d0*pi*y_d(j))!+0.01d0*cos(2.0d0*pi*y_d(j))
END DO
END DO
CALL cufftExecD2Z(pland2z,omeg_d,omeghat_d)

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!get initial nonlinear term using omeghat to find psihat, u, and v!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14) END DO END DO !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u

!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=omeghat_d(i,j)*kx_d(i) END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j)
END DO
END DO

!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*ky_d(j)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y

!combine to get full nonlinear term in real space
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes END DO END DO CALL cufftExecD2Z(pland2z,nl_d,nlhat_d) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! temp2_d=omeg_d !omegacheck PRINT *,'Got initial data, starting timestepping' CALL system_clock(start,count_rate) DO t=1,nplots DO n=1,plotgap chg=1.0d0 nloldhat_d=nlhat_d omegoldhat_d=omeghat_d DO WHILE (chg>tol) !$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
omeghat_d(i,j)=((dtInv+0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)))&
*omegoldhat_d(i,j) - 0.5d0*(nloldhat_d(i,j)+nlhat_d(i,j)))&
/(dtInv-0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)))
END DO
END DO
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14) END DO END DO CALL cufftExecZ2D(planz2d,omeghat_d,omeg_d) !check for convergence chg=0.0d0 !$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
chg=chg+(omeg_d(i,j)-temp2_d(i,j))*(omeg_d(i,j)-temp2_d(i,j))&
*scalemodes*scalemodes
END DO
END DO

!!!!!!!!!!!!!!!!
!nonlinear term!
!!!!!!!!!!!!!!!!
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u !$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*kx_d(i)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x

!$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j) END DO END DO !$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v

!$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=omeghat_d(i,j)*ky_d(j) END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y !combine to get full nonlinear term in real space !$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes
END DO
END DO
CALL cufftExecD2Z(pland2z,nl_d,nlhat_d)
!!!!!!!!!!!!!!!!

temp2_d=omeg_d !save omegacheck
END DO
END DO
!PRINT *, t*plotgap*dt
END DO
CALL system_clock(finish,count_rate)
PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),&
'for Time stepping'

! Copy grid back to host
x=x_d
y=y_d
omeg=omeg_d

!get exact omega
DO j=1,Ny
DO i=1,Nx
omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*&
sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt)
END DO
END DO
!compute max error
PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact))

temp=temp+1
write(name_config,'(a,i0,a)') 'omega',temp,'.datbin'
INQUIRE(iolength=iol) omeg(1,1)
OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol)
count = 1
DO j=1,Ny
DO i=1,Nx
WRITE(11,rec=count) omeg(i,j)*scalemodes
count=count+1
END DO
END DO
CLOSE(11)

CALL cufftDestroy(planz2d)
CALL cufftDestroy(pland2z)
PRINT *,'Destroyed fft plan'

DEALLOCATE(x,y,omeg,omegexact,stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Deallocated CPU memory'

DEALLOCATE(kx_d,ky_d,x_d,y_d,&
omeg_d,omegoldhat_d, nloldhat_d,omeghat_d,&
nl_d, nlhat_d,temp1_d,temp2_d,temp3_d,&
psihat_d,stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Deallocated GPU memory'
PRINT *,'Program execution complete'
END PROGRAM main


( B)

. An OpenACC Fortran program to solve the 2D Navier-Stokes equations} Code download
!--------------------------------------------------------------------
!
! PURPOSE
!
! This program numerically solves the 2D incompressible Navier-Stokes
! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and
! Crank-Nicolson timestepping. The numerical solution is compared to
! the exact Taylor-Green Vortex Solution.
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! Periodic free-slip boundary conditions and Initial conditions:
! u(x,y,0)=sin(2*pi*x)cos(2*pi*y)
! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y)
! Analytical Solution (subscript denote derivatives):
! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re)
! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! omega=v_x-u_y
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! nplots = number of plots produced
! plotgap = number of timesteps inbetween plots
! Re = dimensionless Renold's number
! ReInv = 1/Re for optimization
! dt = timestep size
! dtInv = 1/dt for optimization
! tol = determines when convergences is reached
! scalemodes = 1/(Nx*Ny), scaling after preforming FFTs
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! time = times at which data is saved
! chg = error at each iteration
! .. Arrays ..
! omeg = vorticity in real space
! omeghat = 2D Fourier transform of vorticity
! at next iterate
! omegoldhat = 2D Fourier transform of vorticity at previous
! iterate
! nl = nonlinear term
! nlhat = nonlinear term in Fourier space
! nloldhat = nonlinear term in Fourier space
! at previous iterate
! omegexact = taylor-green vorticity at
! at final step
! psihat = 2D Fourier transform of streamfunction
! at next iteration
! temp1/temp2/temp3= reusable complex/real space used for
! calculations. This reduces number of
! arrays stored.
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! name_config = array to store filename for data to be saved
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External libraries required
! Cuda FFT
! OpenACC
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)
! OpenMP

module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision

module cufft

integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy

interface cufftExecD2Z
subroutine cufftExecD2Z(plan, idata, odata) &
& bind(C,name='cufftExecD2Z')
use iso_c_binding
use precision
integer(c_int), value :: plan
real(fp_kind), device :: idata(1:nx,1:ny)
complex(fp_kind),device :: odata(1:nx/2+1,1:ny)
end subroutine cufftExecD2Z
end interface cufftExecD2Z

interface cufftExecZ2D
subroutine cufftExecZ2D(plan, idata, odata) &
& bind(C,name='cufftExecZ2D')
use iso_c_binding
use precision
integer(c_int),value:: plan
complex(fp_kind),device:: idata(1:nx/2+1,1:ny)
real(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecZ2D
end interface cufftExecZ2D
end module cufft

PROGRAM main
USE precision
USE cufft
USE openacc

IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=512
INTEGER(kind=4), PARAMETER :: Ny=512
REAL(kind=8), PARAMETER	:: dt=0.000125d0
REAL(kind=8), PARAMETER	:: dtInv=1.0d0/REAL(dt,kind(0d0))
REAL(kind=8), PARAMETER	&
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(kind=8), PARAMETER	:: Re=1.0d0
REAL(kind=8), PARAMETER	::	ReInv=1.0d0/REAL(Re,kind(0d0))
REAL(kind=8), PARAMETER	::	tol=0.1d0**10
REAL(kind=8)	:: scalemodes
REAL(kind=8)	:: chg
INTEGER(kind=4), PARAMETER	::	nplots=1, plotgap=20
COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE	:: kx,ky
REAL(kind=8), DIMENSION(:), ALLOCATABLE	:: x,y,time
REAL(kind=8), DIMENSION(:,:), ALLOCATABLE	:: omeg,nl, temp2, temp3, omegexact
COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE	:: omegoldhat, nloldhat,&
omeghat,nlhat, psihat,temp1
INTEGER(kind=4)	:: i,j,n,t, allocatestatus
INTEGER(kind=4)	:: pland2z,planz2d
INTEGER(kind=4)	:: count, iol
CHARACTER*100	:: name_config
INTEGER(kind=4)	:: start, finish, count_rate
INTEGER(kind=4)	:: ierr,vecsize,gangsize
INTEGER(kind=8)	:: planfxy,planbxy

vecsize=32
gangsize=16
PRINT *,'Grid:',Nx,'X',Ny
PRINT *,'dt:',dt
ALLOCATE(time(1:nplots+1),kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),&
omeg(1:Nx,1:Ny),omegoldhat(1:Nx/2+1,1:Ny),&
nloldhat(1:Nx/2+1,1:Ny),temp3(1:Nx,1:Ny),omeghat(1:Nx/2+1,1:Ny),&
nl(1:Nx,1:Ny),nlhat(1:Nx/2+1,1:Ny), psihat(1:Nx/2+1,1:Ny),&
temp1(1:Nx/2+1,1:Ny),omegexact(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'allocated space'

CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z)
CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D)

PRINT *,'Setup 2D FFTs'

! setup fourier frequencies in x-direction
!$acc data copy(kx,ky,x,y,time,temp3,omeg,nl,temp1,temp2,omegoldhat,nloldhat,omeghat,nlhat,psihat) PRINT *, 'Copied arrays over to device' !$acc kernels loop
DO i=1,Nx/2+1
kx(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))
END DO
!$acc end kernels kx(1+Nx/2)=0.0d0 !$acc kernels loop
DO i = 1,Nx/2 -1
kx(i+1+Nx/2)=-kx(1-i+Nx/2)
END DO
!$acc end kernels !$acc kernels loop
DO i=1,Nx
x(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0))
END DO
!$acc end kernels ! setup fourier frequencies in y-direction !$acc kernels loop
DO j=1,Ny/2+1
ky(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))
END DO
!$acc end kernels ky(1+Ny/2)=0.0d0 !$acc kernels loop
DO j = 1,Ny/2 -1
ky(j+1+Ny/2)=-ky(1-j+Ny/2)
END DO
!$acc end kernels !$acc kernels loop
DO j=1,Ny
y(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0))
END DO
!$acc end kernels scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0)) PRINT *,'Setup grid and fourier frequencies' !initial data !$acc kernels loop
DO j=1,Ny
DO i=1,NX
omeg(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*sin(2.0d0*pi*y(j))!+0.01d0*cos(2.0d0*pi*y(j))
END DO
END DO
!$acc end kernels !\hat{\omega^{n,k}} CALL cufftExecD2Z(pland2z,omeg,omeghat) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !get initial nonlinear term using omeghat, psihat, u, and v! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !\hat{\psi^{n+1,k+1}} !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14)
END DO
END DO
!$acc end kernels !\omega^{n+1,k+1} CALL cufftExecZ2D(planz2d,omeghat,omeg) !get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=psihat(i,j)*ky(j)*scalemodes
END DO
END DO
!$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) !u ! \hat{\omega_x^{n,k}} !$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*kx(i)
END DO
END DO
!$acc end kernels ! \omega_x^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! first part nonlinear term in real space !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=temp3(i,j)*temp2(i,j)
END DO
END DO
!$acc end kernels !get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes
END DO
END DO
!$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) !v ! \hat{\omega_y^{n,k}} !$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*ky(j)
END DO
END DO
!$acc end kernels ! \omega_y^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! get the rest of nonlinear term in real space !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes
END DO
END DO
!$acc end kernels ! transform nonlinear term into fourier space CALL cufftExecD2Z(pland2z,nl,nlhat) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=omeg(i,j)
END DO
END DO
!$acc end kernels PRINT *,'Got initial data, starting timestepping' time(1)=0.0d0 CALL system_clock(start,count_rate) DO t=1,nplots DO n=1,plotgap chg=1.0d0 ! save old values(__^{n,k} terms in equation) !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
nloldhat(i,j)=nlhat(i,j)
END DO
END DO
!$acc end kernels !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
omegoldhat(i,j)=omeghat(i,j)
END DO
END DO
!$acc end kernels DO WHILE (chg>tol) !Crank-Nicolson timestepping to get \hat{\omega^{n+1,k+1}} !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
omeghat(i,j)=( (dtInv+0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j)))&
*omegoldhat(i,j) - 0.5d0*(nloldhat(i,j)+nlhat(i,j)))/&
(dtInv-0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j)))
END DO
END DO
!$acc end kernels CALL cufftExecZ2D(planz2d,omeghat,omeg) ! check for convergence chg=0.0d0 !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
chg=chg+(omeg(i,j)-temp2(i,j))*(omeg(i,j)-temp2(i,j))&
*scalemodes*scalemodes
END DO
END DO
!$acc end kernels !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !get nonlinear term using omeghat, psihat, u, and v! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !\hat{\psi^{n+1,k+1}} !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14)
END DO
END DO
!$acc end kernels !\omega^{n+1,k+1} CALL cufftExecZ2D(planz2d,omeghat,omeg) !get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=psihat(i,j)*ky(j)*scalemodes
END DO
END DO
!$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) !u ! \hat{\omega_x^{n,k}} !$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*kx(i)
END DO
END DO
!$acc end kernels ! \omega_x^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! first part nonlinear term in real space !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=temp3(i,j)*temp2(i,j)
END DO
END DO
!$acc end kernels !get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes
END DO
END DO
!$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) ! \hat{\omega_y^{n,k}} !$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*ky(j)
END DO
END DO
!$acc end kernels ! \omega_y^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! get the rest of nonlinear term in real space !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes
END DO
END DO
!$acc end kernels ! transform nonlinear term into fourier space CALL cufftExecD2Z(pland2z,nl,nlhat) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !\omega^{n+1,k+1} is saved for next iteration !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=omeg(i,j)
END DO
END DO
!$acc end kernels END DO END DO time(t+1)=time(t)+dt*plotgap !PRINT *, time(t+1) END DO CALL system_clock(finish,count_rate) PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),& 'for Time stepping' !get exact omega !$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*&
sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt)
END DO
END DO
!$acc end kernels !$acc end data

!compute max error
PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact))

!!!!!!!!!!!!!!!!!!!!!!!!
!copy over data to disk!
!!!!!!!!!!!!!!!!!!!!!!!!
write(name_config,'(a,i0,a)') 'omega',1,'.datbin'
INQUIRE(iolength=iol) omeg(1,1)
OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol)
count = 1
DO j=1,Ny
DO i=1,Nx
WRITE(11,rec=count) omeg(i,j)*scalemodes
count=count+1
END DO
END DO
CLOSE(11)

name_config = 'time.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nplots+1
WRITE(11,*) time(i)
END DO
CLOSE(11)

name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)

name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
!!!!!!!!!!!!!!!!!!!!!!!!

CALL cufftDestroy(pland2z)
CALL cufftDestroy(planz2d)

DEALLOCATE(time,temp1,temp2,temp3,kx,ky,x,y,&
omeg,omegoldhat,omegexact, nloldhat,&
omeghat,nl, nlhat, psihat,&
stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Program execution complete'

END PROGRAM main


## 2D Cubic Nonlinear Schrödinger Equations

These programs use splitting.

( C)

A CUDA Fortran program to solve the 2D Nonlinear Schrödinger equation Code download
	!--------------------------------------------------------------------
!
! PURPOSE
!
! This program solves nonlinear Schrodinger equation in 2 dimensions
! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0
! using a second order time spectral splitting scheme
!
! The boundary conditions are u(x=0,y)=u(x=2*L*\pi,y)
! and u(x,y=0)=u(x,y=2*L*\pi)
! The initial condition is u=exp(-x^2-y^2)
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! Tmax = maximum simulation time
! plotgap = number of timesteps between plots
! pi = 3.14159265358979323846264338327950288419716939937510d0
! L = width of box
! ES = +1 for focusing and -1 for defocusing
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! plan = fft plan
! dt = timestep
! InMass = initial mass
! FiMass = final mass
! InEner = initial energy
! FiEner = final energy
! .. Arrays ..
! u = approximate solution
! v = Fourier transform of approximate solution
! u_d = approximate solution on device
! v_d = Fourier transform of approximate solution on device
! temp1_d = temporary array used to find mass and energy
! temp2_d = temporary array used to find mass and energy
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/calling-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
!
! External libraries required
! cufft -- Cuda FFT library
!

!
! Define the INTERFACE to the NVIDIA CUFFT routines
!

module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision

module cufft

integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecZ2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleComplex *odata,
! int direction;
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2Z
subroutine cufftExecZ2Z(plan, idata, odata, direction) &
& bind(C,name='cufftExecZ2Z')
use iso_c_binding
use precision
integer(c_int),value:: direction
integer(c_int),value:: plan
complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata
end subroutine cufftExecZ2Z
end interface cufftExecZ2Z

end module cufft

PROGRAM main
use precision
use cufft
! Declare host variables and scalars
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
INTEGER(kind=4), PARAMETER :: Ny=1024
INTEGER(kind=4), PARAMETER :: Nt=20
INTEGER(kind=4), PARAMETER :: plotgap=5
REAL(fp_kind), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Lx=5.0d0
REAL(fp_kind), PARAMETER :: Ly=5.0d0
REAL(fp_kind), PARAMETER :: Es=1.0d0
REAL(fp_kind) :: dt=0.10d0**5
REAL(fp_kind) :: scalemodes
COMPLEX(fp_kind) :: InMass,FiMass,InEner,FiEner
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: x,y
COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE :: u
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: time
INTEGER(kind=4) :: i,j,k,n,modes,AllocateStatus,plan, kersize
INTEGER(kind=4) :: start, finish, count_rate
CHARACTER*100 :: name_config
! Declare variables for GPU
COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
REAL(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: u_d,v_d,temp1_d,temp2_d
REAL(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: time_d

! start timing
PRINT *,'Program starting'
PRINT *,'Grid: ',Nx,'X',Ny
! allocate arrays on the host
ALLOCATE(x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),time(1:Nt+1),&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated CPU arrays'

! allocate arrays on the device
ALLOCATE(kx_d(1:Nx),ky_d(1:Nx),x_d(1:Nx),y_d(1:Nx),&
u_d(1:Nx,1:Ny),v_d(1:Nx,1:Ny),time_d(1:Nt+1),&
temp1_d(1:Nx,1:Ny),temp2_d(1:Nx,1:Ny),stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated GPU arrays'

kersize=min(Nx,256)
! set up ffts
CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z)
PRINT *,'Setup FFTs'
! setup fourier frequencies
!$cuf kernel do <<< *, * >>> DO i=1,1+Nx/2 kx_d(i)= cmplx(0.0d0,1.0d0)*(i-1.0d0)/Lx END DO kx_d(1+Nx/2)=0.0d0 !$cuf kernel do <<< *, * >>>
DO i = 1,Nx/2 -1
kx_d(i+1+Nx/2)=-kx_d(1-i+Nx/2)
END DO
!$cuf kernel do <<< *, * >>> DO i=1,Nx x_d(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind))*pi*Lx END DO !$cuf kernel do <<< *, * >>>
DO j=1,1+Ny/2
ky_d(j)= cmplx(0.0d0,1.0d0)*(j-1.0d0)/Ly
END DO
ky_d(1+Ny/2)=0.0d0
!$cuf kernel do <<< *, * >>> DO j = 1,Ny/2 -1 ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2) END DO !$cuf kernel do <<< *, * >>>
DO j=1,Ny
y_d(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind))*pi*Ly
END DO
scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind)
PRINT *,'Setup grid and fourier frequencies'

!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx u_d(i,j)=exp(-1.0d0*(x_d(i)**2+y_d(j)**2)) END DO END DO ! transform initial data CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD) PRINT *,'Got initial data' ! get initial mass !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
InMass=temp2_d(1,1)
! Get initial energy
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4 END DO END DO ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD) InEner=temp2_d(1,1) !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2 END DO END DO ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD) InEner=InEner+temp1_d(1,1) !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2 END DO END DO ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD) InEner=InEner+temp1_d(1,1) ! start timing CALL system_clock(start,count_rate) ! Do first half time step CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD) !$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*( kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j) )&
*cmplx(0.0d0,0.50d0) )*v_d(i,j)
END DO
END DO

PRINT *,'Starting timestepping'
time(1)=0.0d0
DO n=1,Nt
time_d(n+1)=n*dt
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>> DO j=1,Ny DO i=1,Nx v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2 END DO END DO !$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))&
*u_d(i,j)*scalemodes
END DO
END DO

CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>> DO j=1,Ny DO i=1,Nx v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))& *cmplx(0.0d0,1.0d0))*v_d(i,j) END DO END DO END DO ! transform back final data and do another half time step CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE) !$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2
END DO
END DO
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>> DO j=1,Ny DO i=1,Nx u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))& *u_d(i,j)*scalemodes END DO END DO CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD) !$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*cmplx(0.0d0,0.50d0))*v_d(i,j)
END DO
END DO
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
! normalize
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>> DO j=1,Ny DO i=1,Nx u_d(i,j)=u_d(i,j)*scalemodes END DO END DO CALL system_clock(finish,count_rate) PRINT*,'Program took ',& REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),'s for execution' PRINT *,'Finished time stepping' ! calculate final mass !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
FiMass=temp2_d(1,1)

PRINT*,'Initial mass',InMass
PRINT*,'Final mass',FiMass
PRINT*,'Final Mass/Initial Mass', &
ABS(REAL(FiMass,kind=fp_kind)/REAL(InMass,kind=fp_kind))

! Get final energy
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4 END DO END DO ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD) FiEner=temp2_d(1,1) !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2 END DO END DO ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD) FiEner=FiEner+temp1_d(1,1) !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2 END DO END DO ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD) FiEner=FiEner+temp1_d(1,1) PRINT*,'Initial energy',InEner PRINT*,'Final energy',FiEner PRINT*,'Final Energy/Initial Energy', & ABS(REAL(FiEner,kind=fp_kind)/REAL(InEner,kind=fp_kind)) ! Copy results back to host u=u_d time=time_d x=x_d y=y_d name_config = 'ufinal.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,Ny DO i=1,Nx WRITE(11,*) abs(u(i,j))**2 END DO END DO CLOSE(11) name_config = 'tdata.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,1+Nt/plotgap WRITE(11,*) time(j) END DO CLOSE(11) name_config = 'xcoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO i=1,Nx WRITE(11,*) x(i) END DO CLOSE(11) name_config = 'ycoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,Ny WRITE(11,*) y(j) END DO CLOSE(11) PRINT *,'Saved data' ! Destroy the plan CALL cufftDestroy(plan) DEALLOCATE(kx_d,ky_d,x_d,y_d,& u_d,v_d,time_d,& temp1_d,temp2_d,& stat=AllocateStatus) IF (allocatestatus .ne. 0) STOP DEALLOCATE(x,y,u,time,& stat=AllocateStatus) IF (allocatestatus .ne. 0) STOP PRINT *,'deallocated memory' PRINT *,'Program execution complete' END PROGRAM main  ( D) An OpenACC Fortran program to solve the 2D Nonlinear Schrödinger equation Code download !-------------------------------------------------------------------- ! ! ! PURPOSE ! ! This program solves nonlinear Schrodinger equation in 2 dimensions ! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0 ! using a second order time spectral splitting scheme ! ! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y), ! u(x,y=0)=u(x,y=2*Ly*\pi) ! The initial condition is u=exp(-x^2-y^2) ! ! AUTHORS ! ! B. Cloutier, B.K. Muite, P. Rigge ! 4 June 2012 ! ! .. Parameters .. ! Nx = number of modes in x - power of 2 for FFT ! Ny = number of modes in y - power of 2 for FFT ! Nt = number of timesteps to take ! Tmax = maximum simulation time ! plotgap = number of timesteps between plots ! FFTW_IN_PLACE = value for FFTW input ! FFTW_MEASURE = value for FFTW input ! FFTW_EXHAUSTIVE = value for FFTW input ! FFTW_PATIENT = value for FFTW input ! FFTW_ESTIMATE = value for FFTW input ! FFTW_FORWARD = value for FFTW input ! FFTW_BACKWARD = value for FFTW input ! pi = 3.14159265358979323846264338327950288419716939937510d0 ! Lx = width of box in x direction ! Ly = width of box in y direction ! ES = +1 for focusing and -1 for defocusing ! .. Scalars .. ! i = loop counter in x direction ! j = loop counter in y direction ! n = loop counter for timesteps direction ! allocatestatus = error indicator during allocation ! numthreads = number of openmp threads ! ierr = error return code ! start = variable to record start time of program ! finish = variable to record end time of program ! count_rate = variable for clock count rate ! planfxy = Forward 2d fft plan ! planbxy = Backward 2d fft plan ! dt = timestep ! InMass = initial mass ! FiMass = final mass ! InEner = initial energy ! FiEner = final energy ! .. Arrays .. ! u = approximate solution ! v = Fourier transform of approximate solution ! temp1 = temporary field ! temp2 = temporary field ! .. Vectors .. ! kx = fourier frequencies in x direction ! ky = fourier frequencies in y direction ! x = x locations ! y = y locations ! time = times at which save data ! name_config = array to store filename for data to be saved ! ! REFERENCES ! ! This program is based on example code to demonstrate usage of Fortran and ! CUDA FFT routines taken from ! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html ! ! and ! ! http://cudamusing.blogspot.com/search?q=cublas ! ! ACKNOWLEDGEMENTS ! ! ACCURACY ! ! ERROR INDICATORS AND WARNINGS ! ! FURTHER COMMENTS ! Check that the initial iterate is consistent with the ! boundary conditions for the domain specified !-------------------------------------------------------------------- ! External routines required ! precision ! cufft ! ! External libraries required ! CuFFT -- Cuda FFT Library ! OpenACC ! ! Define the INTERFACE to the NVIDIA CUFFT routines ! module precision ! Precision control integer, parameter, public :: Single = kind(0.0) ! Single precision integer, parameter, public :: Double = kind(0.0d0) ! Double precision integer, parameter, public :: fp_kind = Double !integer, parameter, public :: fp_kind = Single end module precision module cufft integer, public :: CUFFT_FORWARD = -1 integer, public :: CUFFT_INVERSE = 1 integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved) integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftPlan2d subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d') use iso_c_binding integer(c_int):: plan integer(c_int),value:: nx, ny, type end subroutine cufftPlan2d end interface cufftPlan2d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftDestroy(cufftHandle plan) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftDestroy subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') use iso_c_binding integer(c_int),value:: plan end subroutine cufftDestroy end interface cufftDestroy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecZ2Z(cufftHandle plan, ! cufftDoubleComplex *idata, ! cufftDoubleComplex *odata, ! int direction; ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecZ2Z subroutine cufftExecZ2Z(plan, idata, odata, direction) & & bind(C,name='cufftExecZ2Z') use iso_c_binding use precision integer(c_int),value:: direction integer(c_int),value:: plan complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata end subroutine cufftExecZ2Z end interface cufftExecZ2Z end module cufft PROGRAM main USE precision USE cufft USE openacc ! Declare variables IMPLICIT NONE INTEGER(kind=4), PARAMETER :: Nx=128 INTEGER(kind=4), PARAMETER :: Ny=128 INTEGER(kind=4), PARAMETER :: Nt=20 INTEGER(kind=4), PARAMETER :: plotgap=20 REAL(fp_kind), PARAMETER :: & pi=3.14159265358979323846264338327950288419716939937510d0 REAL(fp_kind), PARAMETER :: Lx=5.0d0 REAL(fp_kind), PARAMETER :: Ly=5.0d0 REAL(fp_kind), PARAMETER :: Es=1.0d0 REAL(fp_kind) :: dt=0.10d0**5 REAL(fp_kind) :: scalemodes COMPLEX(fp_kind) :: InMass,FiMass,InEner,FiEner COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE :: kx COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE :: ky REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: x REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: y COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE :: u,v,temp1,temp2 REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: time INTEGER(kind=4) :: i,j,k,n,allocatestatus,ierr, vecsize,gangsize REAL(fp_kind) :: start_time,stop_time INTEGER(kind=4) :: plan CHARACTER*100 :: name_config vecsize=32 gangsize=16 PRINT *,'Program starting' PRINT *,'Grid: ',Nx,'X',Ny ALLOCATE(kx(1:Nx),ky(1:Nx),x(1:Nx),y(1:Nx),u(1:Nx,1:Ny),& v(1:Nx,1:Ny),temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),& time(1:1+Nt/plotgap),stat=allocatestatus) IF (allocatestatus .ne. 0) stop PRINT *,'allocated memory' !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)

! set up ffts
CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z)
PRINT *,'Setup FFTs'

! setup fourier frequencies
!$acc kernels loop DO i=1,1+Nx/2 kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx END DO !$acc end kernels
kx(1+Nx/2)=0.0d0
!$acc kernels loop DO i = 1,Nx/2 -1 kx(i+1+Nx/2)=-kx(1-i+Nx/2) END DO !$acc end kernels
!$acc kernels loop DO i=1,Nx x(i)=(-1.0d0+2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)) )*pi*Lx END DO !$acc end kernels
!$acc kernels loop DO j=1,1+Ny/2 ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly END DO !$acc end kernels
ky(1+Ny/2)=0.0d0
!$acc kernels loop DO j = 1,Ny/2 -1 ky(j+1+Ny/2)=-ky(1-j+Ny/2) END DO !$acc end kernels
!$acc kernels loop DO j=1,Ny y(j)=(-1.0d0+2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)) )*pi*Ly END DO !$acc end kernels
scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0))
PRINT *,'Setup grid and fourier frequencies'
!$acc kernels loop DO j=1,Ny DO i=1,Nx u(i,j)=exp(-1.0d0*(x(i)**2 +y(j)**2)) END DO END DO !$acc end kernels
! transform initial data
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)

PRINT *,'Got initial data'
! get initial mass
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=abs(u(i,j))**2 END DO END DO !$acc end kernels
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data InMass=temp2(1,1) ! Get initial energy !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4 END DO END DO !$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data InEner=temp2(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=kx(i)*v(i,j)*scalemodes END DO END DO !$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data InEner=InEner+temp1(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=ky(j)*v(i,j)*scalemodes END DO END DO !$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data InEner=InEner+temp1(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
CALL cpu_time(start_time)

! transform initial data and do first half time step
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))& *cmplx(0.0d0,1.0d0))*v(i,j) END DO END DO !$acc end kernels
PRINT *,'Got initial data, starting timestepping'
time(1)=0.0d0
DO n=1,Nt
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2 END DO END DO !$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx u(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v(i,j))& *u(i,j)*scalemodes END DO END DO !$acc end kernels
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=exp(dt*(kx(i)*kx(i) + ky(j)*ky(j))& *cmplx(0.0d0,1.0d0))*v(i,j) END DO END DO !$acc end kernels
IF (mod(n,plotgap)==0) then
time(1+n/plotgap)=n*dt
PRINT *,'time',n*dt
END IF
END DO
! transform back final data and do another half time step
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2 END DO END DO !$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx u(i,j)=exp(cmplx(0,-1)*dt*v(i,j))*u(i,j)*scalemodes END DO END DO !$acc end kernels
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))& *cmplx(0.0d0,1.0d0))*v(i,j) END DO END DO !$acc end kernels
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx u(i,j)=u(i,j)*scalemodes END DO END DO !$acc end kernels
PRINT *,'Finished time stepping'
CALL cpu_time(stop_time)
!$acc end data PRINT*,'Program took ',stop_time-start_time,& 'for Time stepping' !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)

! calculate final mass
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=abs(u(i,j))**2 END DO END DO !$acc end kernels
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data FiMass=temp2(1,1) ! Get final energy !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4 END DO END DO !$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data FiEner=temp2(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=kx(i)*v(i,j)*scalemodes END DO END DO !$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data FiEner=FiEner+temp1(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=ky(j)*v(i,j)*scalemodes END DO END DO !$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data FiEner=FiEner+temp1(1,1) PRINT *,'Results copied back to host' PRINT*,'Initial mass',InMass PRINT*,'Final mass',FiMass PRINT*,'Final Mass/Initial Mass', & ABS(REAL(FiMass,kind(0d0))/REAL(InMass,kind(0d0))) PRINT*,'Initial energy',InEner PRINT*,'Final energy',FiEner PRINT*,'Final Energy/Initial Energy', & ABS(REAL(FiEner,kind(0d0))/REAL(InEner,kind(0d0))) name_config = 'ufinal.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,Ny DO i=1,Nx WRITE(11,*) abs(u(i,j))**2 END DO END DO CLOSE(11) name_config = 'tdata.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,1+Nt/plotgap WRITE(11,*) time(j) END DO CLOSE(11) name_config = 'xcoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO i=1,Nx WRITE(11,*) x(i) END DO CLOSE(11) name_config = 'ycoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,Ny WRITE(11,*) y(j) END DO CLOSE(11) PRINT *,'Saved data' CALL cufftDestroy(plan) DEALLOCATE(u,v,temp1,temp2,time,kx,ky,x,y,stat=allocatestatus) IF (allocatestatus .ne. 0) STOP PRINT *,'Deallocated memory' PRINT *,'Program execution complete' END PROGRAM main  ## 2D sine-Gordon Equations These programs use a semi-explicit method that is similar to that used for the Klein-Gordon equation. Only the main program is included here, and the auxiliary subroutines can be downloaded from Cloutier, Muite and Rigge[4] ( E) A CUDA Fortran program to solve the 2D sine-Gordon equation Code download !-------------------------------------------------------------------- ! ! ! PURPOSE ! ! This program solves nonlinear sine-Gordon equation in 2 dimensions ! u_{tt}-u_{xx}-u_{yy}=-sin(u) ! using a second order implicit-explicit time stepping scheme. ! ! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y), ! u(x,y=0)=u(x,y=2*Ly*\pi) ! The initial condition is set in initialdata.f90 ! ! AUTHORS ! ! B. Cloutier, B.K. Muite, P. Rigge ! 4 June 2012 ! ! .. Parameters .. ! Nx = number of modes in x - power of 2 for FFT ! Ny = number of modes in y - power of 2 for FFT ! Nt = number of timesteps to take ! plotgap = number of timesteps between plots ! FFTW_IN_PLACE = value for FFTW input ! FFTW_MEASURE = value for FFTW input ! FFTW_EXHAUSTIVE = value for FFTW input ! FFTW_PATIENT = value for FFTW input ! FFTW_ESTIMATE = value for FFTW input ! FFTW_FORWARD = value for FFTW input ! FFTW_BACKWARD = value for FFTW input ! pi = 3.1415926535... ! Lx = width of box in x direction ! Ly = width of box in y direction ! .. Scalars .. ! i = loop counter in x direction ! j = loop counter in y direction ! n = loop counter for timesteps direction ! allocatestatus = error indicator during allocation ! start = variable to record start time of program ! finish = variable to record end time of program ! count_rate = variable for clock count rate ! planfxy = Forward 2d fft plan (FFTW) ! planbxy = Backward 2d fft plan (FFTW) ! planf = Forward 2d fft plan (CUFFT) ! planb = Backward 2d fft plan (CUFFT) ! dt = timestep ! ierr = error code ! plotnum = number of plot ! .. Arrays .. ! u = approximate solution ! uold = approximate solution ! u_d = approximate solution (on GPU) ! v_d = Fourier transform of approximate solution (on GPU) ! uold_d = approximate solution (on GPU) ! vold_d = Fourier transform of approximate solution (on GPU) ! nonlinhat_d = Fourier transform of nonlinear term, sin(u) (on GPU) ! temp1 = extra space for energy computation ! temp2 = extra space for energy computation ! savearray = temp array to save out to disk ! .. Vectors .. ! kx = Fourier frequencies in x direction ! ky = Fourier frequencies in y direction ! kx_d = Fourier frequencies in x direction (on GPU) ! ky_d = Fourier frequencies in y direction (on GPU) ! x = x locations ! y = y locations ! time = times at which save data ! en = total energy ! enstr = strain energy ! enpot = potential energy ! enkin = kinetic energy ! name_config = array to store filename for data to be saved ! ! REFERENCES ! ! ACKNOWLEDGEMENTS ! ! This program is based on example code to demonstrate usage of Fortran and ! CUDA FFT routines taken from ! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html ! ! and ! ! http://cudamusing.blogspot.com/search?q=cublas ! ! ACCURACY ! ! ERROR INDICATORS AND WARNINGS ! ! FURTHER COMMENTS ! Check that the initial iterate is consistent with the ! boundary conditions for the domain specified !-------------------------------------------------------------------- ! External routines required ! getgrid.f90 -- Get initial grid of points ! initialdata.f90 -- Get initial data ! enercalc.f90 -- Subroutine to calculate the energy ! savedata.f90 -- Save initial data ! External libraries required ! Cuda FFT -- http://developer.nvidia.com/cufft ! FFTW3 -- Fastest Fourier Transform in the West ! (http://www.fftw.org/) module precision ! Precision control integer, parameter, public :: Single = kind(0.0) ! Single precision integer, parameter, public :: Double = kind(0.0d0) ! Double precision ! integer, parameter, public :: fp_kind = Double !integer, parameter, public :: fp_kind = Single end module precision module cufft integer, public :: CUFFT_FORWARD = -1 integer, public :: CUFFT_INVERSE = 1 integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved) integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftPlan2d subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d') use iso_c_binding integer(c_int):: plan integer(c_int),value:: nx, ny, type end subroutine cufftPlan2d end interface cufftPlan2d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftDestroy(cufftHandle plan) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftDestroy subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') use iso_c_binding integer(c_int),value:: plan end subroutine cufftDestroy end interface cufftDestroy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecD2Z(cufftHandle plan, ! cufftDoubleReal *idata, ! cufftDoubleComplex *odata) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecD2Z subroutine cufftExecD2Z(plan, idata, odata) & & bind(C,name='cufftExecD2Z') use iso_c_binding use precision integer(c_int), value :: plan real(fp_kind), device :: idata(1:nx,1:ny) complex(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecD2Z end interface cufftExecD2Z !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecD2Z(cufftHandle plan, ! cufftDoubleComplex *idata, ! cufftDoubleReal *odata) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecZ2D subroutine cufftExecZ2D(plan, idata, odata) & & bind(C,name='cufftExecZ2D') use iso_c_binding use precision integer(c_int),value :: plan complex(fp_kind),device:: idata(1:nx,1:ny) real(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecZ2D end interface cufftExecZ2D end module cufft PROGRAM sg2d USE precision USE cudafor USE cufft ! Declare variables IMPLICIT NONE INTEGER(kind=4), PARAMETER :: Nx=1024 INTEGER(kind=4), PARAMETER :: Ny=Nx INTEGER(kind=4), PARAMETER :: Nt=500 INTEGER(kind=4), PARAMETER :: plotgap=Nt+1 REAL(kind=8), PARAMETER :: & pi=3.14159265358979323846264338327950288419716939937510d0 REAL(kind=8), PARAMETER :: Lx=5.0d0 REAL(kind=8), PARAMETER :: Ly=5.0d0 REAL(kind=8) :: dt=0.001d0 COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2 REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en REAL(kind=8) :: scalemodes INTEGER(kind=4) :: ierr,i,j,n,allocatestatus INTEGER(kind=4) :: start, finish, count_rate, plotnum INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, & FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64 INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1 INTEGER(kind=8) :: planfxy,planbxy CHARACTER*100 :: name_config INTEGER(kind=4) :: planf,planb,kersize ! GPU variables COMPLEX(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: kx_d,ky_d COMPLEX(fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: vold_d,v_d,nonlinhat_d REAL (fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: uold_d,u_d ! print run information PRINT *,"Nx=", Nx PRINT *,"Ny=", Ny PRINT *,"Nt=", Nt PRINT *,"Lx=", Lx PRINT *,"Ly=", Ly PRINT *,"dt=", dt kersize=min(Nx,256) ALLOCATE(kx(1:Nx),ky(1:Ny),kx_d(1:Nx),ky_d(1:Ny),x(1:Nx),y(1:Ny),& u(1:Nx,1:Ny),uold(1:Nx,1:Ny),u_d(1:Nx,1:Ny),uold_d(1:Nx,1:Ny),& v_d(1:Nx/2+1,1:Ny),vold_d(1:Nx/2+1,1:Ny),& savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),& enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),& nonlinhat_d(1:Nx/2+1,1:Ny),& temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),& stat=allocatestatus) IF (allocatestatus .ne. 0) stop PRINT *,'allocated arrays' scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0)) ! set up cuda ffts call cufftPlan2D(planf,nx,ny,CUFFT_D2Z) call cufftPlan2D(planb,nx,ny,CUFFT_Z2D) ! set up fftw ffts CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE) CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE) PRINT *,'Setup FFTs' ! setup grid, wave numbers CALL getgrid(Nx,Ny,Lx,Ly,pi,name_config,x,y,kx,ky) kx_d=kx ky_d=ky PRINT *,'Got grid and fourier frequencies' CALL initialdata(Nx,Ny,x,y,u,uold) u_d=u uold_d=uold plotnum=1 name_config = 'data/u' savearray=REAL(u) ! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking PRINT *,'data saved' CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),& enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold) call cufftExecD2Z(planf,u_d,v_d) call cufftExecD2Z(planf,uold_d,vold_d) PRINT *,'Got initial data, starting timestepping' time(plotnum)=0.0d0 CALL system_clock(start,count_rate) DO n=1,Nt !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
uold_d(i,j)=u_d(i,j)
END DO
END DO
!$cuf kernel do(2) <<< (1,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx u_d(i,j)=sin(u_d(i,j)) END DO END DO call cufftExecD2Z(planf,u_d,nonlinhat_d) !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
nonlinhat_d(i,j)=scalemodes*( 0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*(2.0d0*v_d(i,j)+vold_d(i,j))&
+(2.0d0*v_d(i,j)-vold_d(i,j))/(dt*dt)&
-nonlinhat_d(i,j) )&
/(1/(dt*dt)-0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j)))
END DO
END DO
!$cuf kernel do(2) <<< (1,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 vold_d(i,j)=v_d(i,j) END DO END DO !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
v_d(i,j)=nonlinhat_d(i,j)/scalemodes
END DO
END DO
call cufftExecZ2D(planb,nonlinhat_d,u_d)
IF (mod(n,plotgap)==0) then
plotnum=plotnum+1
time(plotnum)=n*dt
PRINT *,'time',n*dt
u=u_d
uold=uold_d
! savearray=REAL(u,kind(0d0)) ! disabled for benchmarking
! CALL savedata(Nx,Ny,plotnum,name_config,savearray)
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold)
END IF
END DO
CALL system_clock(finish,count_rate)
PRINT *,'Finished time stepping'
u=u_d
uold=uold_d
! compute energy at the end
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),&
enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u,uold)

PRINT*,'Program took ',&
REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),&
'for Time stepping'
CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),&
enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1))

! Save times at which output was made in text format
PRINT *,'Saved data'

call cufftDestroy(planf)
call cufftDestroy(planb)
PRINT *,'Destroy CUFFT Plans'
call dfftw_destroy_plan_(planbxy)
call dfftw_destroy_plan_(planfxy)
PRINT *,'Destroy FFTW Plans'
DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,&
stat=allocatestatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated host arrays'
DEALLOCATE(uold_d,vold_d,u_d,v_d,nonlinhat_d,&
kx_d,ky_d,&
stat=allocatestatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated gpu arrays'
PRINT *,'Program execution complete'
END PROGRAM sg2d


( F)

An OpenACC Fortran program to solve the 2D sine-Gordon equation Code download
!--------------------------------------------------------------------
!
!
! PURPOSE
!
! This program solves nonlinear sine-Gordon equation in 2 dimensions
! u_{tt}-u_{xx}-u_{yy}=-sin(u)
! using a second order implicit-explicit time stepping scheme.
!
! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y),
! u(x,y=0)=u(x,y=2*Ly*\pi)
! The initial condition is set in initialdata.f90
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! plotgap = number of timesteps between plots
! FFTW_IN_PLACE = value for FFTW input
! FFTW_MEASURE = value for FFTW input
! FFTW_EXHAUSTIVE = value for FFTW input
! FFTW_PATIENT = value for FFTW input
! FFTW_ESTIMATE = value for FFTW input
! FFTW_FORWARD = value for FFTW input
! FFTW_BACKWARD = value for FFTW input
! pi = 3.1415926535...
! Lx = width of box in x direction
! Ly = width of box in y direction
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! planfxy = Forward 2d fft plan (FFTW)
! planbxy = Backward 2d fft plan (FFTW)
! planf = Forward 2d fft plan (CUFFT)
! planb = Backward 2d fft plan (CUFFT)
! dt = timestep
! ierr = error code
! plotnum = number of plot
! .. Arrays ..
! u = approximate solution
! uold = approximate solution
! v = Fourier transform of approximate solution
! vold = Fourier transform of approximate solution
! nonlinhat = Fourier transform of nonlinear term, sin(u)
! temp1 = extra space for energy computation
! temp2 = extra space for energy computation
! savearray = temp array to save out to disk
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! en = total energy
! enstr = strain energy
! enpot = potential energy
! enkin = kinetic energy
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
! getgrid.f90 -- Get initial grid of points
! initialdata.f90 -- Get initial data
! enercalc.f90 -- Subroutine to calculate the energy
! savedata.f90 -- Save initial data
! External libraries required
! Cuda FFT
! OpenACC
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)
! OpenMP
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
!
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision

module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecD2Z(cufftHandle plan,
! cufftDoubleReal *idata,
! cufftDoubleComplex *odata)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecD2Z
subroutine cufftExecD2Z(plan, idata, odata) &
& bind(C,name='cufftExecD2Z')
use iso_c_binding
use precision
integer(c_int), value :: plan
real(fp_kind), device :: idata(1:nx,1:ny)
complex(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecD2Z
end interface cufftExecD2Z
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecD2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleReal *odata)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2D
subroutine cufftExecZ2D(plan, idata, odata) &
& bind(C,name='cufftExecZ2D')
use iso_c_binding
use precision
integer(c_int),value :: plan
complex(fp_kind),device:: idata(1:nx,1:ny)
real(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecZ2D
end interface cufftExecZ2D
end module cufft

PROGRAM sg2d
USE precision
USE cufft
USE openacc
! Declare variables
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
INTEGER(kind=4), PARAMETER :: Ny=Nx
INTEGER(kind=4), PARAMETER :: Nt=500
INTEGER(kind=4), PARAMETER :: plotgap=Nt+1
REAL(kind=8), PARAMETER :: &
pi=3.14159265358979323846264338327950288419716939937510d0
REAL(kind=8), PARAMETER :: Lx=5.0d0
REAL(kind=8), PARAMETER :: Ly=5.0d0
REAL(kind=8) :: dt=0.001d0
COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky
REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y
REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold
COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2,v,vold,nonlinhat
REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray
REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en
INTEGER(kind=4) :: ierr,i,j,n,allocatestatus
INTEGER(kind=4) :: start, finish, count_rate, plotnum
INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, &
FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64
INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1
INTEGER(kind=8) :: planfxy,planbxy
CHARACTER*100 :: name_config
INTEGER(kind=4) :: planf,planb
! print run information
PRINT *,"Nx=", Nx
PRINT *,"Ny=", Ny
PRINT *,"Nt=", Nt
PRINT *,"Lx=", Lx
PRINT *,"Ly=", Ly
PRINT *,"dt=", dt
ALLOCATE(kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),uold(1:Nx,1:Ny),&
v(1:Nx/2+1,1:Ny),vold(1:Nx/2+1,1:Ny),nonlinhat(1:Nx/2+1,1:Ny),&
savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),&
enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),&
temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
stat=allocatestatus)
IF (allocatestatus .ne. 0) stop
PRINT *,'allocated arrays'
! set up cuda ffts
call cufftPlan2D(planf,nx,ny,CUFFT_D2Z)
call cufftPlan2D(planb,nx,ny,CUFFT_Z2D)
! set up fftw ffts
CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE)
CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE)
PRINT *,'Setup FFTs'
! setup grid, wave numbers
!$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u) !$acc kernels loop
DO i=1,1+Nx/2
kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx
END DO
!$acc end kernels kx(1+Nx/2)=0.0d0 !$acc kernels loop
DO i = 1,Nx/2 -1
kx(i+1+Nx/2)=-kx(1-i+Nx/2)
END DO
!$acc end kernels !$acc kernels loop
DO i=1,Nx
x(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)))*pi*Lx
END DO
!$acc end kernels !$acc kernels loop
DO j=1,1+Ny/2
ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly
END DO
!$acc end kernels ky(1+Ny/2)=0.0d0 !$acc kernels loop
DO j = 1,Ny/2 -1
ky(j+1+Ny/2)=-ky(1-j+Ny/2)
END DO
!$acc end kernels !$acc kernels loop
DO j=1,Ny
y(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)))*pi*Ly
END DO
!$acc end kernels PRINT *,'Got grid and fourier frequencies' !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
u(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2))
END DO
END DO
!$acc end kernels !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
uold(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2))
END DO
END DO
!$acc end kernels savearray=REAL(u) plotnum=1 name_config = 'data/u' ! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking PRINT *,'data saved' !$acc end data
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
enpot(plotnum),en(plotnum),kx(1:Nx),ky(1:Ny),temp1,temp2,&
u(1:Nx,1:Ny),uold(1:Nx,1:Ny))
!$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u) call cufftExecD2Z(planf,u,v) call cufftExecD2Z(planf,uold,vold) PRINT *,'Got initial data, starting timestepping' time(plotnum)=0.0d0 CALL system_clock(start,count_rate) DO n=1,Nt !$acc kernels loop
DO j=1,Ny
DO i=1,Nx
uold(i,j)=u(i,j)
u(i,j)=sin(u(i,j))
END DO
END DO
!$acc end kernels call cufftExecD2Z(planf,u,nonlinhat) !$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
nonlinhat(i,j)=( 0.25*(kx(i)*kx(i) + ky(j)*ky(j))&
*(2.0d0*v(i,j)+vold(i,j))+(2.0d0*v(i,j)-vold(i,j))/(dt*dt)&
-nonlinhat(i,j) )/(1/(dt*dt)-0.25*(kx(i)*kx(i) + ky(j)*ky(j)))
vold(i,j)=v(i,j)
v(i,j)=nonlinhat(i,j)
! prescale nonlinhat
nonlinhat(i,j)=nonlinhat(i,j)/REAL(Nx*Ny,kind(0d0))
END DO
END DO
!$acc end kernels call cufftExecZ2D(planb,nonlinhat,u) END DO CALL system_clock(finish,count_rate) !$acc end data
PRINT *,'Finished time stepping'
! compute energy at the end
! savearray=REAL(u(1:Nx,1:Ny),kind(0d0)) ! disabled for benchmarking
! CALL savedata(Nx,Ny,plotnum+1,name_config,savearray)
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),&
enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u(1:Nx,1:Ny),uold(1:Nx,1:Ny))
PRINT*,'Program took ',&
REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),&
'for Time stepping'
CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),&
enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1))

! Save times at which output was made in text format
PRINT *,'Saved data'

call cufftDestroy(planf)
call cufftDestroy(planb)
PRINT *,'Destroy CUFFT Plans'
call dfftw_destroy_plan_(planbxy)
call dfftw_destroy_plan_(planfxy)
PRINT *,'Destroy FFTW Plans'
DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,&
stat=allocatestatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated host arrays'
PRINT *,'Program execution complete'
END PROGRAM sg2d


## References

B., Cloutier; B.K., Muite; P., Rigge. "Performance of FORTRAN and C GPU Extensions for a Benchmark Suite of Fourier Pseudospectral Algorithms". Proceedings of the Symposium on Application Accelerators in High Performance computing. IEEE.