GPU programs for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrödinger and sine Gordon equations

This section includes the programs taken from a conference paper by Cloutier, Muite and Rigge^[1]. The main purpose is to give example programs which show how to use graphics processing units (GPUs) to solve partial differential equations using Fourier methods. For further background on GPUs and programming models for GPUs see Cloutier, Muite and Rigge^[2]. It should be noted that the algorithms used for the sine Gordon equation are very similar to those for the Klein Gordon equation discussed elsewhere in this tutorial. For consistency with the rest of the tutorial, only programs using CUDA Fortran and OpenACC extensions to Fortran are included although Cloutier, Muite and Rigge^[3] also has CUDA C programs. GPUs enable acceleration of Fourier pseudospectral codes by factors of 10 compared to OpenMP parallelizations on a single 8 core node.

2D Navier Stokes Equations

These programs use the Crank-Nicolson method.

( A)

A CUDA Fortran program to solve the 2D Navier-Stokes equations Code download

!--------------------------------------------------------------------
!
! PURPOSE
!
! This program numerically solves the 2D incompressible Navier-Stokes
! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and
! Crank-Nicolson timestepping. The numerical solution is compared to
! the exact Taylor-Green Vortex Solution.
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! Periodic free-slip boundary conditions and Initial conditions:
! u(x,y,0)=sin(2*pi*x)cos(2*pi*y)
! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y)
! Analytical Solution (subscript denote derivatives):
! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re)
! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! omega=v_x-u_y
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! nplots = number of plots produced
! plotgap = number of timesteps inbetween plots
! Re = dimensionless Renold's number
! ReInv = 1/Re for optimization
! dt = timestep size
! dtInv = 1/dt for optimization
! tol = determines when convergences is reached
! numthreads = number of CPUs used in calculation
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! time = times at which data is saved
! chg = error at each iteration
! .. Arrays (gpu) ..
! omeg_d = vorticity in real space
! omeghat_d = 2D Fourier transform of vorticity
! at next iterate
! omegoldhat_d = 2D Fourier transform of vorticity at previous
! iterate
! nloldhat_d = nonlinear term in Fourier space
! at previous iterate
! psihat_d = 2D Fourier transform of streamfunction
! at next iteration
! temp1_d/temp2_d/temp3_d = reusable complex/real space used for
! calculations. This reduces number of
! arrays stored.
! .. Vectors (gpu) ..
! kx_d = fourier frequencies in x direction
! ky_d = fourier frequencies in y direction
! x_d = x locations
! y_d = y locations
! name_config = array to store filename for data to be saved
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! This program has not been fully optimized.
!--------------------------------------------------------------------
module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision

module cufft

integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy

interface cufftExecD2Z
    subroutine cufftExecD2Z(plan, idata, odata) &
          & bind(C,name='cufftExecD2Z')
       use iso_c_binding
       use precision
       integer(c_int), value :: plan
       real(fp_kind), device :: idata(1:nx,1:ny)
       complex(fp_kind),device :: odata(1:nx,1:ny)
    end subroutine cufftExecD2Z
end interface cufftExecD2Z

interface cufftExecZ2D
    subroutine cufftExecZ2D(plan, idata, odata) &
          & bind(C,name='cufftExecZ2D')
       use iso_c_binding
       use precision
       integer(c_int),value:: plan
       complex(fp_kind),device:: idata(1:nx,1:ny)
       real(fp_kind),device :: odata(1:nx,1:ny)
    end subroutine cufftExecZ2D
end interface cufftExecZ2D

end module cufft

PROGRAM main
use precision
use cufft
! declare variables
    IMPLICIT NONE
    INTEGER(kind=4), PARAMETER :: Nx=4096
INTEGER(kind=4), PARAMETER :: Ny=4096
INTEGER(kind=8) :: temp=10000000
REAL(fp_kind), PARAMETER :: dt=0.000125d0 !dt=0.000002d0
REAL(fp_kind), PARAMETER :: dtInv=1.0d0/REAL(dt,kind(0d0))
REAL(fp_kind), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Re=1.0d0
REAL(fp_kind), PARAMETER :: ReInv=1.0d0/REAL(Re,kind(0d0))
REAL(fp_kind), PARAMETER :: tol=0.1d0**10
REAL(fp_kind) :: scalemodes,chg
INTEGER(kind=4), PARAMETER :: nplots=1,plotgap=20
REAL(fp_kind),DIMENSION(:), ALLOCATABLE :: x,y
REAL(fp_kind),DIMENSION(:,:), ALLOCATABLE :: omeg,omegexact
INTEGER(kind=4) :: i,j,n,t, AllocateStatus
INTEGER(kind=4) :: planz2d,pland2z, kersize
!variables used for saving data and timing
INTEGER(kind=4) :: start, finish, count_rate,count, iol
CHARACTER*100 :: name_config
! Declare variables for GPU
REAL(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omeg_d,nl_d, temp2_d,&
temp3_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omegoldhat_d, nloldhat_d,&
omeghat_d, nlhat_d, psihat_d,&
temp1_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
REAL(kind=8),DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d

kersize=min(Nx,256)
PRINT *,'Program starting'
     PRINT *,'Grid:',Nx,'X',Ny
PRINT *,'dt:',dt
ALLOCATE(x(1:Nx),y(1:Ny),omeg(1:Nx,1:Ny),omegexact(1:Nx,1:Ny),&
stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Allocated CPU arrays'
ALLOCATE(kx_d(1:Nx/2+1),ky_d(1:Ny),x_d(1:Nx),y_d(1:Ny),omeg_d(1:Nx,1:Ny),&
omegoldhat_d(1:Nx/2+1,1:Ny),nloldhat_d(1:Nx/2+1,1:Ny),&
omeghat_d(1:Nx/2+1,1:Ny),nl_d(1:Nx,1:Ny),&
nlhat_d(1:Nx/2+1,1:Ny),psihat_d(1:Nx/2+1,1:Ny),temp1_d(1:Nx/2+1,1:Ny),&
temp2_d(1:Nx,1:Ny),temp3_d(1:Nx,1:Ny),stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Allocated GPU arrays'
CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z)
CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D)
PRINT *,'Setup FFTs'

! setup fourier frequencies
!$cuf kernel do <<< *,* >>>
DO i=1,Nx/2+1
kx_d(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind=fp_kind)
END DO
kx_d(1+Nx/2)=0.0d0
!$cuf kernel do <<< *,* >>>
DO i=1,Nx
x_d(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind)
END DO
!$cuf kernel do <<< *,* >>>
DO j=1,Ny/2+1
ky_d(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind=fp_kind)
END DO
  ky_d(1+Ny/2)=0.0d0
   !$cuf kernel do <<< *,* >>>
DO j = 1,Ny/2 -1
ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2)
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,Ny
y_d(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind)
END DO
scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind)
PRINT *,'Setup grid and fourier frequencies'

!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
omeg_d(i,j)=4.0d0*pi*sin(2.0d0*pi*x_d(i))*sin(2.0d0*pi*y_d(j))!+0.01d0*cos(2.0d0*pi*y_d(j))
END DO
END DO
CALL cufftExecD2Z(pland2z,omeg_d,omeghat_d)

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!get initial nonlinear term using omeghat to find psihat, u, and v!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14)
END DO
END DO

  !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u

  !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*kx_d(i)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x

  !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j)
END DO
END DO

  !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v

  !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*ky_d(j)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y

!combine to get full nonlinear term in real space
  !$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes
END DO
END DO
CALL cufftExecD2Z(pland2z,nl_d,nlhat_d)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

temp2_d=omeg_d !omegacheck
PRINT *,'Got initial data, starting timestepping'
CALL system_clock(start,count_rate)
DO t=1,nplots
DO n=1,plotgap
chg=1.0d0
nloldhat_d=nlhat_d
omegoldhat_d=omeghat_d
DO WHILE (chg>tol)
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
omeghat_d(i,j)=((dtInv+0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)))&
*omegoldhat_d(i,j) - 0.5d0*(nloldhat_d(i,j)+nlhat_d(i,j)))&
/(dtInv-0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)))
END DO
END DO
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14)
END DO
END DO
CALL cufftExecZ2D(planz2d,omeghat_d,omeg_d)

!check for convergence
chg=0.0d0
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
chg=chg+(omeg_d(i,j)-temp2_d(i,j))*(omeg_d(i,j)-temp2_d(i,j))&
*scalemodes*scalemodes
END DO
END DO

!!!!!!!!!!!!!!!!
!nonlinear term!
!!!!!!!!!!!!!!!!
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u

!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*kx_d(i)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x

!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j)
END DO
END DO

!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v

!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*ky_d(j)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y

!combine to get full nonlinear term in real space
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes
END DO
END DO
CALL cufftExecD2Z(pland2z,nl_d,nlhat_d)
!!!!!!!!!!!!!!!!

temp2_d=omeg_d !save omegacheck
END DO
END DO
!PRINT *, t*plotgap*dt
END DO
CALL system_clock(finish,count_rate)
PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),&
'for Time stepping'

! Copy grid back to host
x=x_d
y=y_d
omeg=omeg_d

!get exact omega
DO j=1,Ny
DO i=1,Nx
omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*&
sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt)
END DO
END DO
!compute max error
PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact))

temp=temp+1
write(name_config,'(a,i0,a)') 'omega',temp,'.datbin'
INQUIRE(iolength=iol) omeg(1,1)
OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol)
count = 1
DO j=1,Ny
DO i=1,Nx
WRITE(11,rec=count) omeg(i,j)*scalemodes
count=count+1
END DO
END DO
CLOSE(11)

CALL cufftDestroy(planz2d)
CALL cufftDestroy(pland2z)
PRINT *,'Destroyed fft plan'

DEALLOCATE(x,y,omeg,omegexact,stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Deallocated CPU memory'

DEALLOCATE(kx_d,ky_d,x_d,y_d,&
omeg_d,omegoldhat_d, nloldhat_d,omeghat_d,&
nl_d, nlhat_d,temp1_d,temp2_d,temp3_d,&
psihat_d,stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Deallocated GPU memory'
PRINT *,'Program execution complete'
END PROGRAM main

( B)

. An OpenACC Fortran program to solve the 2D Navier-Stokes equations} Code download

!--------------------------------------------------------------------
!
! PURPOSE
!
! This program numerically solves the 2D incompressible Navier-Stokes
! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and
! Crank-Nicolson timestepping. The numerical solution is compared to
! the exact Taylor-Green Vortex Solution.
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! Periodic free-slip boundary conditions and Initial conditions:
! u(x,y,0)=sin(2*pi*x)cos(2*pi*y)
! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y)
! Analytical Solution (subscript denote derivatives):
! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re)
! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! omega=v_x-u_y
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! nplots = number of plots produced
! plotgap = number of timesteps inbetween plots
! Re = dimensionless Renold's number
! ReInv = 1/Re for optimization
! dt = timestep size
! dtInv = 1/dt for optimization
! tol = determines when convergences is reached
! scalemodes = 1/(Nx*Ny), scaling after preforming FFTs
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! time = times at which data is saved
! chg = error at each iteration
! .. Arrays ..
! omeg = vorticity in real space
! omeghat = 2D Fourier transform of vorticity
! at next iterate
! omegoldhat = 2D Fourier transform of vorticity at previous
! iterate
! nl = nonlinear term
! nlhat = nonlinear term in Fourier space
! nloldhat = nonlinear term in Fourier space
! at previous iterate
! omegexact = taylor-green vorticity at
! at final step
! psihat = 2D Fourier transform of streamfunction
! at next iteration
! temp1/temp2/temp3= reusable complex/real space used for
! calculations. This reduces number of
! arrays stored.
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! name_config = array to store filename for data to be saved
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External libraries required
! Cuda FFT
! OpenACC
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)
! OpenMP

module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision

module cufft

integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy

interface cufftExecD2Z
    subroutine cufftExecD2Z(plan, idata, odata) &
          & bind(C,name='cufftExecD2Z')
       use iso_c_binding
use precision
integer(c_int), value :: plan
       real(fp_kind), device :: idata(1:nx,1:ny)
       complex(fp_kind),device :: odata(1:nx/2+1,1:ny)
    end subroutine cufftExecD2Z
end interface cufftExecD2Z

interface cufftExecZ2D
    subroutine cufftExecZ2D(plan, idata, odata) &
          & bind(C,name='cufftExecZ2D')
       use iso_c_binding
use precision
integer(c_int),value:: plan
       complex(fp_kind),device:: idata(1:nx/2+1,1:ny)
       real(fp_kind),device :: odata(1:nx,1:ny)
    end subroutine cufftExecZ2D
end interface cufftExecZ2D	
end module cufft


PROGRAM main	
USE precision
USE cufft	
USE openacc

IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=512
INTEGER(kind=4), PARAMETER :: Ny=512
REAL(kind=8), PARAMETER	:: dt=0.000125d0
REAL(kind=8), PARAMETER	:: dtInv=1.0d0/REAL(dt,kind(0d0))
REAL(kind=8), PARAMETER	&
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(kind=8), PARAMETER	:: Re=1.0d0
REAL(kind=8), PARAMETER	::	ReInv=1.0d0/REAL(Re,kind(0d0))
REAL(kind=8), PARAMETER	::	tol=0.1d0**10
REAL(kind=8)	:: scalemodes
REAL(kind=8)	:: chg
INTEGER(kind=4), PARAMETER	::	nplots=1, plotgap=20
COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE	:: kx,ky	
REAL(kind=8), DIMENSION(:), ALLOCATABLE	:: x,y,time
REAL(kind=8), DIMENSION(:,:), ALLOCATABLE	:: omeg,nl, temp2, temp3, omegexact
COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE	:: omegoldhat, nloldhat,&
omeghat,nlhat, psihat,temp1
INTEGER(kind=4)	:: i,j,n,t, allocatestatus
INTEGER(kind=4)	:: pland2z,planz2d
INTEGER(kind=4)	:: count, iol
CHARACTER*100	:: name_config
INTEGER(kind=4)	:: start, finish, count_rate
INTEGER(kind=4)	:: ierr,vecsize,gangsize
INTEGER(kind=8)	:: planfxy,planbxy

vecsize=32
gangsize=16
   PRINT *,'Grid:',Nx,'X',Ny
PRINT *,'dt:',dt	
ALLOCATE(time(1:nplots+1),kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),&
omeg(1:Nx,1:Ny),omegoldhat(1:Nx/2+1,1:Ny),&
nloldhat(1:Nx/2+1,1:Ny),temp3(1:Nx,1:Ny),omeghat(1:Nx/2+1,1:Ny),&
nl(1:Nx,1:Ny),nlhat(1:Nx/2+1,1:Ny), psihat(1:Nx/2+1,1:Ny),&
temp1(1:Nx/2+1,1:Ny),omegexact(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
stat=AllocateStatus)	
IF (AllocateStatus .ne. 0) STOP
PRINT *,'allocated space'

CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z)
CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D)
 
PRINT *,'Setup 2D FFTs'

! setup fourier frequencies in x-direction
!$acc data copy(kx,ky,x,y,time,temp3,omeg,nl,temp1,temp2,omegoldhat,nloldhat,omeghat,nlhat,psihat)
PRINT *, 'Copied arrays over to device'
!$acc kernels loop
DO i=1,Nx/2+1
kx(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))
END DO
!$acc end kernels
kx(1+Nx/2)=0.0d0
!$acc kernels loop
DO i = 1,Nx/2 -1
kx(i+1+Nx/2)=-kx(1-i+Nx/2)
END DO	
!$acc end kernels
!$acc kernels loop
DO i=1,Nx
x(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0))
END DO
!$acc end kernels
! setup fourier frequencies in y-direction
!$acc kernels loop
DO j=1,Ny/2+1
ky(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))
END DO
!$acc end kernels
ky(1+Ny/2)=0.0d0
!$acc kernels loop
DO j = 1,Ny/2 -1
ky(j+1+Ny/2)=-ky(1-j+Ny/2)
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,Ny
y(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0))
END DO
!$acc end kernels
scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0))
PRINT *,'Setup grid and fourier frequencies'

!initial data
!$acc kernels loop
DO j=1,Ny
DO i=1,NX
omeg(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*sin(2.0d0*pi*y(j))!+0.01d0*cos(2.0d0*pi*y(j))
END DO
END DO
!$acc end kernels
!\hat{\omega^{n,k}}
CALL cufftExecD2Z(pland2z,omeg,omeghat)

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!get initial nonlinear term using omeghat, psihat, u, and v!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!\hat{\psi^{n+1,k+1}}
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14)
END DO
END DO
!$acc end kernels
!\omega^{n+1,k+1}
CALL cufftExecZ2D(planz2d,omeghat,omeg)

!get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=psihat(i,j)*ky(j)*scalemodes
END DO
END DO	
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3) !u

! \hat{\omega_x^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*kx(i)
END DO
END DO
!$acc end kernels
! \omega_x^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)

! first part nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=temp3(i,j)*temp2(i,j)
END DO
END DO
!$acc end kernels

!get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3) !v

! \hat{\omega_y^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*ky(j)
END DO
END DO
!$acc end kernels
! \omega_y^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)

! get the rest of nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes
END DO
END DO
!$acc end kernels
! transform nonlinear term into fourier space
CALL cufftExecD2Z(pland2z,nl,nlhat)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

!$acc kernels loop
DO j=1,Ny
DO i=1,Nx	
temp2(i,j)=omeg(i,j)	
END DO
END DO
!$acc end kernels

PRINT *,'Got initial data, starting timestepping'	
time(1)=0.0d0
CALL system_clock(start,count_rate)
DO t=1,nplots
DO n=1,plotgap
chg=1.0d0
! save old values(__^{n,k} terms in equation)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
nloldhat(i,j)=nlhat(i,j)
END DO
END DO
!$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
omegoldhat(i,j)=omeghat(i,j)
END DO
END DO
!$acc end kernels
DO WHILE (chg>tol)	
!Crank-Nicolson timestepping to get \hat{\omega^{n+1,k+1}}
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
omeghat(i,j)=( (dtInv+0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j)))&
*omegoldhat(i,j) - 0.5d0*(nloldhat(i,j)+nlhat(i,j)))/&
(dtInv-0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j)))
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,omeghat,omeg)

! check for convergence
chg=0.0d0
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
chg=chg+(omeg(i,j)-temp2(i,j))*(omeg(i,j)-temp2(i,j))&
*scalemodes*scalemodes
END DO
END DO
!$acc end kernels

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!get nonlinear term using omeghat, psihat, u, and v!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!\hat{\psi^{n+1,k+1}}
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14)
END DO
END DO
!$acc end kernels
!\omega^{n+1,k+1}
CALL cufftExecZ2D(planz2d,omeghat,omeg)

!get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=psihat(i,j)*ky(j)*scalemodes
END DO
END DO	
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3) !u

! \hat{\omega_x^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*kx(i)
END DO
END DO
!$acc end kernels
! \omega_x^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)

! first part nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=temp3(i,j)*temp2(i,j)
END DO
END DO
!$acc end kernels

!get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3)

! \hat{\omega_y^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*ky(j)
END DO
END DO
!$acc end kernels
! \omega_y^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)

! get the rest of nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes
END DO
END DO
!$acc end kernels
! transform nonlinear term into fourier space
CALL cufftExecD2Z(pland2z,nl,nlhat)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

!\omega^{n+1,k+1} is saved for next iteration
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=omeg(i,j)
END DO
END DO
!$acc end kernels
END DO
END DO
time(t+1)=time(t)+dt*plotgap
!PRINT *, time(t+1)
END DO
CALL system_clock(finish,count_rate)
PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),&
'for Time stepping'

!get exact omega
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*&
sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt)
END DO
END DO
!$acc end kernels
!$acc end data

!compute max error
PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact))	

!!!!!!!!!!!!!!!!!!!!!!!!
!copy over data to disk!
!!!!!!!!!!!!!!!!!!!!!!!!
write(name_config,'(a,i0,a)') 'omega',1,'.datbin'
INQUIRE(iolength=iol) omeg(1,1)
OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol)
count = 1	
DO j=1,Ny
DO i=1,Nx
WRITE(11,rec=count) omeg(i,j)*scalemodes
count=count+1
END DO
END DO
CLOSE(11)

name_config = 'time.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nplots+1
WRITE(11,*) time(i)
END DO
CLOSE(11)	

name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)

name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
!!!!!!!!!!!!!!!!!!!!!!!!

CALL cufftDestroy(pland2z)
CALL cufftDestroy(planz2d)	

DEALLOCATE(time,temp1,temp2,temp3,kx,ky,x,y,&
omeg,omegoldhat,omegexact, nloldhat,&
omeghat,nl, nlhat, psihat,&
stat=AllocateStatus)	
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Program execution complete'	

END PROGRAM main

2D Cubic Nonlinear Schrödinger Equations

These programs use splitting.

( C)

A CUDA Fortran program to solve the 2D Nonlinear Schrödinger equation Code download

	!--------------------------------------------------------------------
!
! PURPOSE
!
! This program solves nonlinear Schrodinger equation in 2 dimensions
! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0
! using a second order time spectral splitting scheme
!
! The boundary conditions are u(x=0,y)=u(x=2*L*\pi,y)
! and u(x,y=0)=u(x,y=2*L*\pi)
! The initial condition is u=exp(-x^2-y^2)
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! Tmax = maximum simulation time
! plotgap = number of timesteps between plots
! pi = 3.14159265358979323846264338327950288419716939937510d0
! L = width of box
! ES = +1 for focusing and -1 for defocusing
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! plan = fft plan
! dt = timestep
! InMass = initial mass
! FiMass = final mass
! InEner = initial energy
! FiEner = final energy
! .. Arrays ..
! u = approximate solution
! v = Fourier transform of approximate solution
! u_d = approximate solution on device
! v_d = Fourier transform of approximate solution on device
! temp1_d = temporary array used to find mass and energy
! temp2_d = temporary array used to find mass and energy
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/calling-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
!
! External libraries required
! cufft -- Cuda FFT library
!

!
! Define the INTERFACE to the NVIDIA CUFFT routines
!

module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision

module cufft

integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecZ2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleComplex *odata,
! int direction;
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2Z
subroutine cufftExecZ2Z(plan, idata, odata, direction) &
& bind(C,name='cufftExecZ2Z')
use iso_c_binding
use precision
integer(c_int),value:: direction
integer(c_int),value:: plan
complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata
end subroutine cufftExecZ2Z
end interface cufftExecZ2Z

end module cufft

PROGRAM main
use precision
use cufft
! Declare host variables and scalars
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
INTEGER(kind=4), PARAMETER :: Ny=1024
INTEGER(kind=4), PARAMETER :: Nt=20
INTEGER(kind=4), PARAMETER :: plotgap=5
REAL(fp_kind), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Lx=5.0d0
REAL(fp_kind), PARAMETER :: Ly=5.0d0
REAL(fp_kind), PARAMETER :: Es=1.0d0
REAL(fp_kind) :: dt=0.10d0**5
REAL(fp_kind) :: scalemodes
COMPLEX(fp_kind) :: InMass,FiMass,InEner,FiEner
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: x,y
COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE :: u
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: time
INTEGER(kind=4) :: i,j,k,n,modes,AllocateStatus,plan, kersize
INTEGER(kind=4) :: start, finish, count_rate
   CHARACTER*100 :: name_config
! Declare variables for GPU
COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
REAL(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: u_d,v_d,temp1_d,temp2_d
REAL(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: time_d

! start timing
PRINT *,'Program starting'
PRINT *,'Grid: ',Nx,'X',Ny
    ! allocate arrays on the host
ALLOCATE(x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),time(1:Nt+1),&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated CPU arrays'

! allocate arrays on the device
ALLOCATE(kx_d(1:Nx),ky_d(1:Nx),x_d(1:Nx),y_d(1:Nx),&
u_d(1:Nx,1:Ny),v_d(1:Nx,1:Ny),time_d(1:Nt+1),&
temp1_d(1:Nx,1:Ny),temp2_d(1:Nx,1:Ny),stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated GPU arrays'

kersize=min(Nx,256)
! set up ffts
CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z)
PRINT *,'Setup FFTs'
! setup fourier frequencies
!$cuf kernel do <<< *, * >>>
DO i=1,1+Nx/2
kx_d(i)= cmplx(0.0d0,1.0d0)*(i-1.0d0)/Lx
END DO
kx_d(1+Nx/2)=0.0d0
!$cuf kernel do <<< *, * >>>
DO i = 1,Nx/2 -1
kx_d(i+1+Nx/2)=-kx_d(1-i+Nx/2)
END DO
!$cuf kernel do <<< *, * >>>
DO i=1,Nx
x_d(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind))*pi*Lx
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,1+Ny/2
ky_d(j)= cmplx(0.0d0,1.0d0)*(j-1.0d0)/Ly
END DO
ky_d(1+Ny/2)=0.0d0
!$cuf kernel do <<< *, * >>>
DO j = 1,Ny/2 -1
ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2)
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,Ny
y_d(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind))*pi*Ly
END DO
scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind)
PRINT *,'Setup grid and fourier frequencies'

!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(-1.0d0*(x_d(i)**2+y_d(j)**2))
END DO
END DO
! transform initial data
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)

PRINT *,'Got initial data'
! get initial mass
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
InMass=temp2_d(1,1)
! Get initial energy
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
InEner=temp2_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
InEner=InEner+temp1_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
InEner=InEner+temp1_d(1,1)

! start timing
CALL system_clock(start,count_rate)
! Do first half time step
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*( kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j) )&
*cmplx(0.0d0,0.50d0) )*v_d(i,j)
END DO
END DO

PRINT *,'Starting timestepping'
time(1)=0.0d0
DO n=1,Nt
time_d(n+1)=n*dt
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2
END DO
END DO
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))&
*u_d(i,j)*scalemodes
END DO
END DO

CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*cmplx(0.0d0,1.0d0))*v_d(i,j)
END DO
END DO

END DO

! transform back final data and do another half time step

CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2
END DO
END DO
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))&
*u_d(i,j)*scalemodes
END DO
END DO

CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*cmplx(0.0d0,0.50d0))*v_d(i,j)
END DO
END DO
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
! normalize
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=u_d(i,j)*scalemodes
END DO
END DO

CALL system_clock(finish,count_rate)
PRINT*,'Program took ',&
REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),'s for execution'

PRINT *,'Finished time stepping'

! calculate final mass
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
FiMass=temp2_d(1,1)

PRINT*,'Initial mass',InMass
PRINT*,'Final mass',FiMass
PRINT*,'Final Mass/Initial Mass', &
ABS(REAL(FiMass,kind=fp_kind)/REAL(InMass,kind=fp_kind))


! Get final energy
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
FiEner=temp2_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
FiEner=FiEner+temp1_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
FiEner=FiEner+temp1_d(1,1)

PRINT*,'Initial energy',InEner
PRINT*,'Final energy',FiEner
PRINT*,'Final Energy/Initial Energy', &
ABS(REAL(FiEner,kind=fp_kind)/REAL(InEner,kind=fp_kind))

! Copy results back to host
u=u_d
time=time_d
x=x_d
y=y_d

name_config = 'ufinal.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
DO i=1,Nx
WRITE(11,*) abs(u(i,j))**2
END DO
END DO
CLOSE(11)

name_config = 'tdata.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,1+Nt/plotgap
WRITE(11,*) time(j)
END DO
CLOSE(11)

name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)

name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
PRINT *,'Saved data'

! Destroy the plan
CALL cufftDestroy(plan)

DEALLOCATE(kx_d,ky_d,x_d,y_d,&
u_d,v_d,time_d,&
temp1_d,temp2_d,&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
DEALLOCATE(x,y,u,time,&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'deallocated memory'
PRINT *,'Program execution complete'
END PROGRAM main

( D)

An OpenACC Fortran program to solve the 2D Nonlinear Schrödinger equation Code download

!--------------------------------------------------------------------
!
!
! PURPOSE
!
! This program solves nonlinear Schrodinger equation in 2 dimensions
! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0
! using a second order time spectral splitting scheme
!
! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y),
! u(x,y=0)=u(x,y=2*Ly*\pi)
! The initial condition is u=exp(-x^2-y^2)
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! Tmax = maximum simulation time
! plotgap = number of timesteps between plots
! FFTW_IN_PLACE = value for FFTW input
! FFTW_MEASURE = value for FFTW input
! FFTW_EXHAUSTIVE = value for FFTW input
! FFTW_PATIENT = value for FFTW input
! FFTW_ESTIMATE = value for FFTW input
! FFTW_FORWARD = value for FFTW input
! FFTW_BACKWARD = value for FFTW input
! pi = 3.14159265358979323846264338327950288419716939937510d0
! Lx = width of box in x direction
! Ly = width of box in y direction
! ES = +1 for focusing and -1 for defocusing
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! numthreads = number of openmp threads
! ierr = error return code
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! planfxy = Forward 2d fft plan
! planbxy = Backward 2d fft plan
! dt = timestep
! InMass = initial mass
! FiMass = final mass
! InEner = initial energy
! FiEner = final energy
! .. Arrays ..
! u = approximate solution
! v = Fourier transform of approximate solution
! temp1 = temporary field
! temp2 = temporary field
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
! precision
! cufft
!
! External libraries required
! CuFFT -- Cuda FFT Library
! OpenACC

!
! Define the INTERFACE to the NVIDIA CUFFT routines
!

module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision

module cufft

integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecZ2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleComplex *odata,
! int direction;
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2Z
subroutine cufftExecZ2Z(plan, idata, odata, direction) &
& bind(C,name='cufftExecZ2Z')
use iso_c_binding
use precision
integer(c_int),value:: direction
integer(c_int),value:: plan
complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata
end subroutine cufftExecZ2Z
end interface cufftExecZ2Z	
end module cufft

PROGRAM main
USE precision
USE cufft	
USE openacc

! Declare variables
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=128
INTEGER(kind=4), PARAMETER :: Ny=128	
INTEGER(kind=4), PARAMETER	:: Nt=20	
INTEGER(kind=4), PARAMETER	:: plotgap=20	
REAL(fp_kind), PARAMETER	:: &
pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER	:: Lx=5.0d0	
REAL(fp_kind), PARAMETER	:: Ly=5.0d0	
REAL(fp_kind), PARAMETER	:: Es=1.0d0	
REAL(fp_kind)	:: dt=0.10d0**5
REAL(fp_kind)	:: scalemodes
COMPLEX(fp_kind)	:: InMass,FiMass,InEner,FiEner
COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE	:: kx	
COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE	:: ky	
REAL(fp_kind), DIMENSION(:), ALLOCATABLE	:: x	
REAL(fp_kind), DIMENSION(:), ALLOCATABLE	:: y	
COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE	:: u,v,temp1,temp2
REAL(fp_kind), DIMENSION(:), ALLOCATABLE	:: time
INTEGER(kind=4)	:: i,j,k,n,allocatestatus,ierr, vecsize,gangsize
REAL(fp_kind)	:: start_time,stop_time
INTEGER(kind=4)	:: plan
    CHARACTER*100	:: name_config

vecsize=32
gangsize=16
PRINT *,'Program starting'
PRINT *,'Grid: ',Nx,'X',Ny	

ALLOCATE(kx(1:Nx),ky(1:Nx),x(1:Nx),y(1:Nx),u(1:Nx,1:Ny),&
v(1:Nx,1:Ny),temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
time(1:1+Nt/plotgap),stat=allocatestatus)	
IF (allocatestatus .ne. 0) stop
PRINT *,'allocated memory'

!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)

! set up ffts
CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z)
PRINT *,'Setup FFTs'

! setup fourier frequencies
!$acc kernels loop
DO i=1,1+Nx/2
kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx
END DO
!$acc end kernels
kx(1+Nx/2)=0.0d0
!$acc kernels loop
DO i = 1,Nx/2 -1
kx(i+1+Nx/2)=-kx(1-i+Nx/2)
END DO
!$acc end kernels
!$acc kernels loop
   DO i=1,Nx
x(i)=(-1.0d0+2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)) )*pi*Lx
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,1+Ny/2
ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly
END DO
!$acc end kernels
ky(1+Ny/2)=0.0d0
!$acc kernels loop
DO j = 1,Ny/2 -1
ky(j+1+Ny/2)=-ky(1-j+Ny/2)
END DO
!$acc end kernels
!$acc kernels loop
   DO j=1,Ny
y(j)=(-1.0d0+2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)) )*pi*Ly
END DO
!$acc end kernels
scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0))
PRINT *,'Setup grid and fourier frequencies'
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
u(i,j)=exp(-1.0d0*(x(i)**2 +y(j)**2))
END DO
END DO
!$acc end kernels
! transform initial data
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)

PRINT *,'Got initial data'
! get initial mass
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=abs(u(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
InMass=temp2(1,1)
! Get initial energy
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
InEner=temp2(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=kx(i)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
InEner=InEner+temp1(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=ky(j)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
InEner=InEner+temp1(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
CALL cpu_time(start_time)


! transform initial data and do first half time step
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))&
*cmplx(0.0d0,1.0d0))*v(i,j)
END DO
END DO
!$acc end kernels
PRINT *,'Got initial data, starting timestepping'
time(1)=0.0d0
DO n=1,Nt	
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2
END DO
END DO
!$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
u(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v(i,j))&
*u(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=exp(dt*(kx(i)*kx(i) + ky(j)*ky(j))&	
*cmplx(0.0d0,1.0d0))*v(i,j)	
END DO
END DO
!$acc end kernels
IF (mod(n,plotgap)==0) then
time(1+n/plotgap)=n*dt
PRINT *,'time',n*dt
END IF
END DO	
! transform back final data and do another half time step
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2
END DO
END DO
!$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
u(i,j)=exp(cmplx(0,-1)*dt*v(i,j))*u(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))&
*cmplx(0.0d0,1.0d0))*v(i,j)
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
u(i,j)=u(i,j)*scalemodes
END DO
END DO	
!$acc end kernels
PRINT *,'Finished time stepping'
CALL cpu_time(stop_time)
!$acc end data
PRINT*,'Program took ',stop_time-start_time,&
'for Time stepping'
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)

! calculate final mass
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=abs(u(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
FiMass=temp2(1,1)


! Get final energy
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
FiEner=temp2(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=kx(i)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
FiEner=FiEner+temp1(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=ky(j)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
FiEner=FiEner+temp1(1,1)

  PRINT *,'Results copied back to host'
PRINT*,'Initial mass',InMass
PRINT*,'Final mass',FiMass
PRINT*,'Final Mass/Initial Mass', &
ABS(REAL(FiMass,kind(0d0))/REAL(InMass,kind(0d0)))
PRINT*,'Initial energy',InEner
PRINT*,'Final energy',FiEner
PRINT*,'Final Energy/Initial Energy', &
ABS(REAL(FiEner,kind(0d0))/REAL(InEner,kind(0d0)))

name_config = 'ufinal.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
DO i=1,Nx
WRITE(11,*) abs(u(i,j))**2
END DO
END DO
CLOSE(11)

name_config = 'tdata.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,1+Nt/plotgap
WRITE(11,*) time(j)
END DO
CLOSE(11)

name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)

name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
PRINT *,'Saved data'

CALL cufftDestroy(plan)

DEALLOCATE(u,v,temp1,temp2,time,kx,ky,x,y,stat=allocatestatus)	
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated memory'

PRINT *,'Program execution complete'
END PROGRAM main

2D sine-Gordon Equations

These programs use a semi-explicit method that is similar to that used for the Klein-Gordon equation. Only the main program is included here, and the auxiliary subroutines can be downloaded from Cloutier, Muite and Rigge^[4]

( E)

A CUDA Fortran program to solve the 2D sine-Gordon equation Code download

!--------------------------------------------------------------------
!
!
! PURPOSE
!
! This program solves nonlinear sine-Gordon equation in 2 dimensions
! u_{tt}-u_{xx}-u_{yy}=-sin(u)
! using a second order implicit-explicit time stepping scheme.
!
! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y),
! u(x,y=0)=u(x,y=2*Ly*\pi)
! The initial condition is set in initialdata.f90
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! plotgap = number of timesteps between plots
! FFTW_IN_PLACE = value for FFTW input
! FFTW_MEASURE = value for FFTW input
! FFTW_EXHAUSTIVE = value for FFTW input
! FFTW_PATIENT = value for FFTW input
! FFTW_ESTIMATE = value for FFTW input
! FFTW_FORWARD = value for FFTW input
! FFTW_BACKWARD = value for FFTW input
! pi = 3.1415926535...
! Lx = width of box in x direction
! Ly = width of box in y direction
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! planfxy = Forward 2d fft plan (FFTW)
! planbxy = Backward 2d fft plan (FFTW)
! planf = Forward 2d fft plan (CUFFT)
! planb = Backward 2d fft plan (CUFFT)
! dt = timestep
! ierr = error code
! plotnum = number of plot
! .. Arrays ..
! u = approximate solution
! uold = approximate solution
! u_d = approximate solution (on GPU)
! v_d = Fourier transform of approximate solution (on GPU)
! uold_d = approximate solution (on GPU)
! vold_d = Fourier transform of approximate solution (on GPU)
! nonlinhat_d = Fourier transform of nonlinear term, sin(u) (on GPU)
! temp1 = extra space for energy computation
! temp2 = extra space for energy computation
! savearray = temp array to save out to disk
! .. Vectors ..
! kx = Fourier frequencies in x direction
! ky = Fourier frequencies in y direction
! kx_d = Fourier frequencies in x direction (on GPU)
! ky_d = Fourier frequencies in y direction (on GPU)
! x = x locations
! y = y locations
! time = times at which save data
! en = total energy
! enstr = strain energy
! enpot = potential energy
! enkin = kinetic energy
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
! getgrid.f90 -- Get initial grid of points
! initialdata.f90 -- Get initial data
! enercalc.f90 -- Subroutine to calculate the energy
! savedata.f90 -- Save initial data
! External libraries required
! Cuda FFT -- http://developer.nvidia.com/cufft
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)

module precision
  ! Precision control
  integer, parameter, public :: Single = kind(0.0) ! Single precision
  integer, parameter, public :: Double = kind(0.0d0) ! Double precision
  !
  integer, parameter, public :: fp_kind = Double
  !integer, parameter, public :: fp_kind = Single
end module precision

module cufft
  integer, public :: CUFFT_FORWARD = -1
  integer, public :: CUFFT_INVERSE = 1
  integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
  integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
  integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
  integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
  integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
  integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftPlan2d
     subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
       use iso_c_binding
       integer(c_int):: plan
       integer(c_int),value:: nx, ny, type
     end subroutine cufftPlan2d
  end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftDestroy(cufftHandle plan)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftDestroy
     subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
       use iso_c_binding
       integer(c_int),value:: plan
     end subroutine cufftDestroy
  end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftExecD2Z(cufftHandle plan,
  ! cufftDoubleReal *idata,
  ! cufftDoubleComplex *odata)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftExecD2Z
     subroutine cufftExecD2Z(plan, idata, odata) &
          & bind(C,name='cufftExecD2Z')
       use iso_c_binding
       use precision
       integer(c_int), value :: plan
       real(fp_kind), device :: idata(1:nx,1:ny)
       complex(fp_kind),device :: odata(1:nx,1:ny)
     end subroutine cufftExecD2Z
  end interface cufftExecD2Z
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftExecD2Z(cufftHandle plan,
  ! cufftDoubleComplex *idata,
  ! cufftDoubleReal *odata)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftExecZ2D
     subroutine cufftExecZ2D(plan, idata, odata) &
          & bind(C,name='cufftExecZ2D')
       use iso_c_binding
       use precision
       integer(c_int),value :: plan
       complex(fp_kind),device:: idata(1:nx,1:ny)
       real(fp_kind),device :: odata(1:nx,1:ny)
     end subroutine cufftExecZ2D
  end interface cufftExecZ2D
end module cufft

PROGRAM sg2d
  USE precision
  USE cudafor
  USE cufft
  ! Declare variables
  IMPLICIT NONE
  INTEGER(kind=4), PARAMETER :: Nx=1024
  INTEGER(kind=4), PARAMETER :: Ny=Nx
  INTEGER(kind=4), PARAMETER :: Nt=500
  INTEGER(kind=4), PARAMETER :: plotgap=Nt+1
  REAL(kind=8), PARAMETER :: &
       pi=3.14159265358979323846264338327950288419716939937510d0
  REAL(kind=8), PARAMETER :: Lx=5.0d0
  REAL(kind=8), PARAMETER :: Ly=5.0d0
  REAL(kind=8) :: dt=0.001d0
  COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky
  REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y
  REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold
  COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2
  REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray
  REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en
  REAL(kind=8) :: scalemodes
  INTEGER(kind=4) :: ierr,i,j,n,allocatestatus
  INTEGER(kind=4) :: start, finish, count_rate, plotnum
  INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, &
       FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64
  INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1
  INTEGER(kind=8) :: planfxy,planbxy
  CHARACTER*100 :: name_config
  INTEGER(kind=4) :: planf,planb,kersize
  ! GPU variables
  COMPLEX(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
  COMPLEX(fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: vold_d,v_d,nonlinhat_d
  REAL (fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: uold_d,u_d
  ! print run information
  PRINT *,"Nx=", Nx
  PRINT *,"Ny=", Ny
  PRINT *,"Nt=", Nt
  PRINT *,"Lx=", Lx
  PRINT *,"Ly=", Ly
  PRINT *,"dt=", dt
  kersize=min(Nx,256)
  ALLOCATE(kx(1:Nx),ky(1:Ny),kx_d(1:Nx),ky_d(1:Ny),x(1:Nx),y(1:Ny),&
       u(1:Nx,1:Ny),uold(1:Nx,1:Ny),u_d(1:Nx,1:Ny),uold_d(1:Nx,1:Ny),&
       v_d(1:Nx/2+1,1:Ny),vold_d(1:Nx/2+1,1:Ny),&
       savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),&
       enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),&
       nonlinhat_d(1:Nx/2+1,1:Ny),&
       temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
       stat=allocatestatus)
  IF (allocatestatus .ne. 0) stop
  PRINT *,'allocated arrays'
  scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0))
  ! set up cuda ffts
  call cufftPlan2D(planf,nx,ny,CUFFT_D2Z)
  call cufftPlan2D(planb,nx,ny,CUFFT_Z2D)
  ! set up fftw ffts
  CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE)
  CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE)
  PRINT *,'Setup FFTs'
  ! setup grid, wave numbers
  CALL getgrid(Nx,Ny,Lx,Ly,pi,name_config,x,y,kx,ky)
  kx_d=kx
  ky_d=ky
  PRINT *,'Got grid and fourier frequencies'

  CALL initialdata(Nx,Ny,x,y,u,uold)
  u_d=u
  uold_d=uold
  plotnum=1
  name_config = 'data/u'
  savearray=REAL(u)
  ! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking
  PRINT *,'data saved'

  CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
       enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold)
  call cufftExecD2Z(planf,u_d,v_d)
  call cufftExecD2Z(planf,uold_d,vold_d)
  PRINT *,'Got initial data, starting timestepping'
  time(plotnum)=0.0d0
  CALL system_clock(start,count_rate)
  DO n=1,Nt
     !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
     DO j=1,Ny
        DO i=1,Nx
           uold_d(i,j)=u_d(i,j)
        END DO
     END DO
     !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
     DO j=1,Ny
        DO i=1,Nx
           u_d(i,j)=sin(u_d(i,j))
        END DO
     END DO
     call cufftExecD2Z(planf,u_d,nonlinhat_d)
     !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
     DO j=1,Ny
        DO i=1,Nx/2+1
           nonlinhat_d(i,j)=scalemodes*( 0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
                *(2.0d0*v_d(i,j)+vold_d(i,j))&
                +(2.0d0*v_d(i,j)-vold_d(i,j))/(dt*dt)&
                -nonlinhat_d(i,j) )&
                /(1/(dt*dt)-0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j)))
        END DO
     END DO
     !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
     DO j=1,Ny
        DO i=1,Nx/2+1
           vold_d(i,j)=v_d(i,j)
        END DO
     END DO
     !$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
     DO j=1,Ny
        DO i=1,Nx/2+1
           v_d(i,j)=nonlinhat_d(i,j)/scalemodes
        END DO
     END DO
     call cufftExecZ2D(planb,nonlinhat_d,u_d)
     IF (mod(n,plotgap)==0) then
        plotnum=plotnum+1
        time(plotnum)=n*dt
        PRINT *,'time',n*dt
        u=u_d
        uold=uold_d
        ! savearray=REAL(u,kind(0d0)) ! disabled for benchmarking
        ! CALL savedata(Nx,Ny,plotnum,name_config,savearray)
        CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
             enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold)
     END IF
  END DO
  CALL system_clock(finish,count_rate)
  PRINT *,'Finished time stepping'
  u=u_d
  uold=uold_d
  ! compute energy at the end
  CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),&
       enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u,uold)

  PRINT*,'Program took ',&
       REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),&
       'for Time stepping'
  CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),&
       enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1))

  ! Save times at which output was made in text format
  PRINT *,'Saved data'

  call cufftDestroy(planf)
  call cufftDestroy(planb)
  PRINT *,'Destroy CUFFT Plans'
  call dfftw_destroy_plan_(planbxy)
  call dfftw_destroy_plan_(planfxy)
  PRINT *,'Destroy FFTW Plans'
  DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,&
       stat=allocatestatus)
  IF (allocatestatus .ne. 0) STOP
  PRINT *,'Deallocated host arrays'
  DEALLOCATE(uold_d,vold_d,u_d,v_d,nonlinhat_d,&
       kx_d,ky_d,&
       stat=allocatestatus)
  IF (allocatestatus .ne. 0) STOP
  PRINT *,'Deallocated gpu arrays'
  PRINT *,'Program execution complete'
END PROGRAM sg2d

( F)

An OpenACC Fortran program to solve the 2D sine-Gordon equation Code download

!--------------------------------------------------------------------
!
!
! PURPOSE
!
! This program solves nonlinear sine-Gordon equation in 2 dimensions
! u_{tt}-u_{xx}-u_{yy}=-sin(u)
! using a second order implicit-explicit time stepping scheme.
!
! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y),
! u(x,y=0)=u(x,y=2*Ly*\pi)
! The initial condition is set in initialdata.f90
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! plotgap = number of timesteps between plots
! FFTW_IN_PLACE = value for FFTW input
! FFTW_MEASURE = value for FFTW input
! FFTW_EXHAUSTIVE = value for FFTW input
! FFTW_PATIENT = value for FFTW input
! FFTW_ESTIMATE = value for FFTW input
! FFTW_FORWARD = value for FFTW input
! FFTW_BACKWARD = value for FFTW input
! pi = 3.1415926535...
! Lx = width of box in x direction
! Ly = width of box in y direction
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! planfxy = Forward 2d fft plan (FFTW)
! planbxy = Backward 2d fft plan (FFTW)
! planf = Forward 2d fft plan (CUFFT)
! planb = Backward 2d fft plan (CUFFT)
! dt = timestep
! ierr = error code
! plotnum = number of plot
! .. Arrays ..
! u = approximate solution
! uold = approximate solution
! v = Fourier transform of approximate solution
! vold = Fourier transform of approximate solution
! nonlinhat = Fourier transform of nonlinear term, sin(u)
! temp1 = extra space for energy computation
! temp2 = extra space for energy computation
! savearray = temp array to save out to disk
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! en = total energy
! enstr = strain energy
! enpot = potential energy
! enkin = kinetic energy
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
! getgrid.f90 -- Get initial grid of points
! initialdata.f90 -- Get initial data
! enercalc.f90 -- Subroutine to calculate the energy
! savedata.f90 -- Save initial data
! External libraries required
! Cuda FFT
! OpenACC
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)
! OpenMP
module precision
  ! Precision control
  integer, parameter, public :: Single = kind(0.0) ! Single precision
  integer, parameter, public :: Double = kind(0.0d0) ! Double precision
  !
  integer, parameter, public :: fp_kind = Double
  !integer, parameter, public :: fp_kind = Single
end module precision

module cufft
  integer, public :: CUFFT_FORWARD = -1
  integer, public :: CUFFT_INVERSE = 1
  integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
  integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
  integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
  integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
  integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
  integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftPlan2d
     subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
       use iso_c_binding
integer(c_int):: plan
       integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
  end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftDestroy(cufftHandle plan)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftDestroy
     subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
       use iso_c_binding
integer(c_int),value:: plan
     end subroutine cufftDestroy
  end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftExecD2Z(cufftHandle plan,
  ! cufftDoubleReal *idata,
  ! cufftDoubleComplex *odata)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftExecD2Z
     subroutine cufftExecD2Z(plan, idata, odata) &
          & bind(C,name='cufftExecD2Z')
       use iso_c_binding
use precision
integer(c_int), value :: plan
       real(fp_kind), device :: idata(1:nx,1:ny)
       complex(fp_kind),device :: odata(1:nx,1:ny)
     end subroutine cufftExecD2Z
  end interface cufftExecD2Z
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  !
  ! cufftExecD2Z(cufftHandle plan,
  ! cufftDoubleComplex *idata,
  ! cufftDoubleReal *odata)
  !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface cufftExecZ2D
     subroutine cufftExecZ2D(plan, idata, odata) &
          & bind(C,name='cufftExecZ2D')
       use iso_c_binding
use precision
integer(c_int),value :: plan
       complex(fp_kind),device:: idata(1:nx,1:ny)
       real(fp_kind),device :: odata(1:nx,1:ny)
     end subroutine cufftExecZ2D
  end interface cufftExecZ2D
end module cufft

PROGRAM sg2d
  USE precision
USE cufft
  USE openacc
  ! Declare variables
  IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
  INTEGER(kind=4), PARAMETER :: Ny=Nx
  INTEGER(kind=4), PARAMETER :: Nt=500
  INTEGER(kind=4), PARAMETER :: plotgap=Nt+1
  REAL(kind=8), PARAMETER :: &
       pi=3.14159265358979323846264338327950288419716939937510d0
  REAL(kind=8), PARAMETER :: Lx=5.0d0
  REAL(kind=8), PARAMETER :: Ly=5.0d0
  REAL(kind=8) :: dt=0.001d0
  COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky
  REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y
  REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold
  COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2,v,vold,nonlinhat
  REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray
  REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en
  INTEGER(kind=4) :: ierr,i,j,n,allocatestatus
  INTEGER(kind=4) :: start, finish, count_rate, plotnum
  INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, &
       FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64
  INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1
  INTEGER(kind=8) :: planfxy,planbxy
  CHARACTER*100 :: name_config
  INTEGER(kind=4) :: planf,planb
  ! print run information
  PRINT *,"Nx=", Nx
  PRINT *,"Ny=", Ny
  PRINT *,"Nt=", Nt
  PRINT *,"Lx=", Lx
  PRINT *,"Ly=", Ly
  PRINT *,"dt=", dt
  ALLOCATE(kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),uold(1:Nx,1:Ny),&
       v(1:Nx/2+1,1:Ny),vold(1:Nx/2+1,1:Ny),nonlinhat(1:Nx/2+1,1:Ny),&
       savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),&
       enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),&
       temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
       stat=allocatestatus)
  IF (allocatestatus .ne. 0) stop
PRINT *,'allocated arrays'
  ! set up cuda ffts
  call cufftPlan2D(planf,nx,ny,CUFFT_D2Z)
  call cufftPlan2D(planb,nx,ny,CUFFT_Z2D)
  ! set up fftw ffts
  CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE)
  CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE)
  PRINT *,'Setup FFTs'
  ! setup grid, wave numbers
  !$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u)
  !$acc kernels loop
  DO i=1,1+Nx/2
     kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx
  END DO
  !$acc end kernels
  kx(1+Nx/2)=0.0d0
  !$acc kernels loop
  DO i = 1,Nx/2 -1
     kx(i+1+Nx/2)=-kx(1-i+Nx/2)
  END DO
  !$acc end kernels
  !$acc kernels loop
  DO i=1,Nx
     x(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)))*pi*Lx
  END DO
  !$acc end kernels
  !$acc kernels loop
  DO j=1,1+Ny/2
     ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly
  END DO
  !$acc end kernels
  ky(1+Ny/2)=0.0d0
  !$acc kernels loop
  DO j = 1,Ny/2 -1
     ky(j+1+Ny/2)=-ky(1-j+Ny/2)
  END DO
  !$acc end kernels
  !$acc kernels loop
  DO j=1,Ny
     y(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)))*pi*Ly
  END DO
  !$acc end kernels
  PRINT *,'Got grid and fourier frequencies'
  !$acc kernels loop
  DO j=1,Ny
    DO i=1,Nx
        u(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2))
    END DO
END DO
  !$acc end kernels
  !$acc kernels loop
  DO j=1,Ny
    DO i=1,Nx
      uold(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2))
    END DO
END DO
  !$acc end kernels
  savearray=REAL(u)
  plotnum=1
  name_config = 'data/u'
  ! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking
  PRINT *,'data saved'
  !$acc end data
  CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
       enpot(plotnum),en(plotnum),kx(1:Nx),ky(1:Ny),temp1,temp2,&
       u(1:Nx,1:Ny),uold(1:Nx,1:Ny))
  !$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u)
  call cufftExecD2Z(planf,u,v)
  call cufftExecD2Z(planf,uold,vold)
  PRINT *,'Got initial data, starting timestepping'
  time(plotnum)=0.0d0
  CALL system_clock(start,count_rate)
  DO n=1,Nt
     !$acc kernels loop
     DO j=1,Ny
        DO i=1,Nx
           uold(i,j)=u(i,j)
           u(i,j)=sin(u(i,j))
        END DO
END DO
     !$acc end kernels
     call cufftExecD2Z(planf,u,nonlinhat)
     !$acc kernels loop
     DO j=1,Ny
        DO i=1,Nx/2+1
           nonlinhat(i,j)=( 0.25*(kx(i)*kx(i) + ky(j)*ky(j))&
                *(2.0d0*v(i,j)+vold(i,j))+(2.0d0*v(i,j)-vold(i,j))/(dt*dt)&
                -nonlinhat(i,j) )/(1/(dt*dt)-0.25*(kx(i)*kx(i) + ky(j)*ky(j)))
           vold(i,j)=v(i,j)
           v(i,j)=nonlinhat(i,j)
           ! prescale nonlinhat
           nonlinhat(i,j)=nonlinhat(i,j)/REAL(Nx*Ny,kind(0d0))
        END DO
END DO
     !$acc end kernels
     call cufftExecZ2D(planb,nonlinhat,u)
  END DO
CALL system_clock(finish,count_rate)
  !$acc end data
  PRINT *,'Finished time stepping'
  ! compute energy at the end
  ! savearray=REAL(u(1:Nx,1:Ny),kind(0d0)) ! disabled for benchmarking
  ! CALL savedata(Nx,Ny,plotnum+1,name_config,savearray)
  CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),&
       enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u(1:Nx,1:Ny),uold(1:Nx,1:Ny))
  PRINT*,'Program took ',&
       REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),&
       'for Time stepping'
  CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),&
       enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1))

  ! Save times at which output was made in text format
  PRINT *,'Saved data'

  call cufftDestroy(planf)
  call cufftDestroy(planb)
  PRINT *,'Destroy CUFFT Plans'
  call dfftw_destroy_plan_(planbxy)
  call dfftw_destroy_plan_(planfxy)
  PRINT *,'Destroy FFTW Plans'
  DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,&
       stat=allocatestatus)
  IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated host arrays'
  PRINT *,'Program execution complete'
END PROGRAM sg2d

References

B., Cloutier; B.K., Muite; P., Rigge. "Performance of FORTRAN and C GPU Extensions for a Benchmark Suite of Fourier Pseudospectral Algorithms". Proceedings of the Symposium on Application Accelerators in High Performance computing. IEEE. http://arxiv.org/abs/1206.3215.

[1] Cloutier, Muite and Rigge (2012)

[2] Cloutier, Muite and Rigge (2012)

[3] Cloutier, Muite and Rigge (2012)

[4] Cloutier, Muite and Rigge (2012)

[1]

[2]

[3]

[4]