Parallel Spectral Numerical Methods/GPU programs for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrodinger and sine Gordon equations
GPU programs for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrödinger and sine Gordon equations
[edit | edit source]This section includes the programs taken from a conference paper by Cloutier, Muite and Rigge[1]. The main purpose is to give example programs which show how to use graphics processing units (GPUs) to solve partial differential equations using Fourier methods. For further background on GPUs and programming models for GPUs see Cloutier, Muite and Rigge[2]. It should be noted that the algorithms used for the sine Gordon equation are very similar to those for the Klein Gordon equation discussed elsewhere in this tutorial. For consistency with the rest of the tutorial, only programs using CUDA Fortran and OpenACC extensions to Fortran are included although Cloutier, Muite and Rigge[3] also has CUDA C programs. GPUs enable acceleration of Fourier pseudospectral codes by factors of 10 compared to OpenMP parallelizations on a single 8 core node.
2D Navier Stokes Equations
[edit | edit source]These programs use the Crank-Nicolson method.
|
(
) |
!--------------------------------------------------------------------
!
! PURPOSE
!
! This program numerically solves the 2D incompressible Navier-Stokes
! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and
! Crank-Nicolson timestepping. The numerical solution is compared to
! the exact Taylor-Green Vortex Solution.
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! Periodic free-slip boundary conditions and Initial conditions:
! u(x,y,0)=sin(2*pi*x)cos(2*pi*y)
! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y)
! Analytical Solution (subscript denote derivatives):
! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re)
! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! omega=v_x-u_y
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! nplots = number of plots produced
! plotgap = number of timesteps inbetween plots
! Re = dimensionless Renold's number
! ReInv = 1/Re for optimization
! dt = timestep size
! dtInv = 1/dt for optimization
! tol = determines when convergences is reached
! numthreads = number of CPUs used in calculation
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! time = times at which data is saved
! chg = error at each iteration
! .. Arrays (gpu) ..
! omeg_d = vorticity in real space
! omeghat_d = 2D Fourier transform of vorticity
! at next iterate
! omegoldhat_d = 2D Fourier transform of vorticity at previous
! iterate
! nloldhat_d = nonlinear term in Fourier space
! at previous iterate
! psihat_d = 2D Fourier transform of streamfunction
! at next iteration
! temp1_d/temp2_d/temp3_d = reusable complex/real space used for
! calculations. This reduces number of
! arrays stored.
! .. Vectors (gpu) ..
! kx_d = fourier frequencies in x direction
! ky_d = fourier frequencies in y direction
! x_d = x locations
! y_d = y locations
! name_config = array to store filename for data to be saved
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! This program has not been fully optimized.
!--------------------------------------------------------------------
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision
module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
interface cufftExecD2Z
subroutine cufftExecD2Z(plan, idata, odata) &
& bind(C,name='cufftExecD2Z')
use iso_c_binding
use precision
integer(c_int), value :: plan
real(fp_kind), device :: idata(1:nx,1:ny)
complex(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecD2Z
end interface cufftExecD2Z
interface cufftExecZ2D
subroutine cufftExecZ2D(plan, idata, odata) &
& bind(C,name='cufftExecZ2D')
use iso_c_binding
use precision
integer(c_int),value:: plan
complex(fp_kind),device:: idata(1:nx,1:ny)
real(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecZ2D
end interface cufftExecZ2D
end module cufft
PROGRAM main
use precision
use cufft
! declare variables
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=4096
INTEGER(kind=4), PARAMETER :: Ny=4096
INTEGER(kind=8) :: temp=10000000
REAL(fp_kind), PARAMETER :: dt=0.000125d0 !dt=0.000002d0
REAL(fp_kind), PARAMETER :: dtInv=1.0d0/REAL(dt,kind(0d0))
REAL(fp_kind), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Re=1.0d0
REAL(fp_kind), PARAMETER :: ReInv=1.0d0/REAL(Re,kind(0d0))
REAL(fp_kind), PARAMETER :: tol=0.1d0**10
REAL(fp_kind) :: scalemodes,chg
INTEGER(kind=4), PARAMETER :: nplots=1,plotgap=20
REAL(fp_kind),DIMENSION(:), ALLOCATABLE :: x,y
REAL(fp_kind),DIMENSION(:,:), ALLOCATABLE :: omeg,omegexact
INTEGER(kind=4) :: i,j,n,t, AllocateStatus
INTEGER(kind=4) :: planz2d,pland2z, kersize
!variables used for saving data and timing
INTEGER(kind=4) :: start, finish, count_rate,count, iol
CHARACTER*100 :: name_config
! Declare variables for GPU
REAL(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omeg_d,nl_d, temp2_d,&
temp3_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omegoldhat_d, nloldhat_d,&
omeghat_d, nlhat_d, psihat_d,&
temp1_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
REAL(kind=8),DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d
kersize=min(Nx,256)
PRINT *,'Program starting'
PRINT *,'Grid:',Nx,'X',Ny
PRINT *,'dt:',dt
ALLOCATE(x(1:Nx),y(1:Ny),omeg(1:Nx,1:Ny),omegexact(1:Nx,1:Ny),&
stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Allocated CPU arrays'
ALLOCATE(kx_d(1:Nx/2+1),ky_d(1:Ny),x_d(1:Nx),y_d(1:Ny),omeg_d(1:Nx,1:Ny),&
omegoldhat_d(1:Nx/2+1,1:Ny),nloldhat_d(1:Nx/2+1,1:Ny),&
omeghat_d(1:Nx/2+1,1:Ny),nl_d(1:Nx,1:Ny),&
nlhat_d(1:Nx/2+1,1:Ny),psihat_d(1:Nx/2+1,1:Ny),temp1_d(1:Nx/2+1,1:Ny),&
temp2_d(1:Nx,1:Ny),temp3_d(1:Nx,1:Ny),stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Allocated GPU arrays'
CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z)
CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D)
PRINT *,'Setup FFTs'
! setup fourier frequencies
!$cuf kernel do <<< *,* >>>
DO i=1,Nx/2+1
kx_d(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind=fp_kind)
END DO
kx_d(1+Nx/2)=0.0d0
!$cuf kernel do <<< *,* >>>
DO i=1,Nx
x_d(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind)
END DO
!$cuf kernel do <<< *,* >>>
DO j=1,Ny/2+1
ky_d(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind=fp_kind)
END DO
ky_d(1+Ny/2)=0.0d0
!$cuf kernel do <<< *,* >>>
DO j = 1,Ny/2 -1
ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2)
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,Ny
y_d(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind)
END DO
scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind)
PRINT *,'Setup grid and fourier frequencies'
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
omeg_d(i,j)=4.0d0*pi*sin(2.0d0*pi*x_d(i))*sin(2.0d0*pi*y_d(j))!+0.01d0*cos(2.0d0*pi*y_d(j))
END DO
END DO
CALL cufftExecD2Z(pland2z,omeg_d,omeghat_d)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!get initial nonlinear term using omeghat to find psihat, u, and v!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14)
END DO
END DO
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*kx_d(i)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j)
END DO
END DO
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*ky_d(j)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y
!combine to get full nonlinear term in real space
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes
END DO
END DO
CALL cufftExecD2Z(pland2z,nl_d,nlhat_d)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
temp2_d=omeg_d !omegacheck
PRINT *,'Got initial data, starting timestepping'
CALL system_clock(start,count_rate)
DO t=1,nplots
DO n=1,plotgap
chg=1.0d0
nloldhat_d=nlhat_d
omegoldhat_d=omeghat_d
DO WHILE (chg>tol)
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
omeghat_d(i,j)=((dtInv+0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)))&
*omegoldhat_d(i,j) - 0.5d0*(nloldhat_d(i,j)+nlhat_d(i,j)))&
/(dtInv-0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)))
END DO
END DO
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14)
END DO
END DO
CALL cufftExecZ2D(planz2d,omeghat_d,omeg_d)
!check for convergence
chg=0.0d0
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
chg=chg+(omeg_d(i,j)-temp2_d(i,j))*(omeg_d(i,j)-temp2_d(i,j))&
*scalemodes*scalemodes
END DO
END DO
!!!!!!!!!!!!!!!!
!nonlinear term!
!!!!!!!!!!!!!!!!
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*kx_d(i)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j)
END DO
END DO
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
temp1_d(i,j)=omeghat_d(i,j)*ky_d(j)
END DO
END DO
CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y
!combine to get full nonlinear term in real space
!$cuf kernel do(2) <<< (2,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes
END DO
END DO
CALL cufftExecD2Z(pland2z,nl_d,nlhat_d)
!!!!!!!!!!!!!!!!
temp2_d=omeg_d !save omegacheck
END DO
END DO
!PRINT *, t*plotgap*dt
END DO
CALL system_clock(finish,count_rate)
PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),&
'for Time stepping'
! Copy grid back to host
x=x_d
y=y_d
omeg=omeg_d
!get exact omega
DO j=1,Ny
DO i=1,Nx
omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*&
sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt)
END DO
END DO
!compute max error
PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact))
temp=temp+1
write(name_config,'(a,i0,a)') 'omega',temp,'.datbin'
INQUIRE(iolength=iol) omeg(1,1)
OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol)
count = 1
DO j=1,Ny
DO i=1,Nx
WRITE(11,rec=count) omeg(i,j)*scalemodes
count=count+1
END DO
END DO
CLOSE(11)
CALL cufftDestroy(planz2d)
CALL cufftDestroy(pland2z)
PRINT *,'Destroyed fft plan'
DEALLOCATE(x,y,omeg,omegexact,stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Deallocated CPU memory'
DEALLOCATE(kx_d,ky_d,x_d,y_d,&
omeg_d,omegoldhat_d, nloldhat_d,omeghat_d,&
nl_d, nlhat_d,temp1_d,temp2_d,temp3_d,&
psihat_d,stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Deallocated GPU memory'
PRINT *,'Program execution complete'
END PROGRAM main
|
(
) |
!--------------------------------------------------------------------
!
! PURPOSE
!
! This program numerically solves the 2D incompressible Navier-Stokes
! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and
! Crank-Nicolson timestepping. The numerical solution is compared to
! the exact Taylor-Green Vortex Solution.
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! Periodic free-slip boundary conditions and Initial conditions:
! u(x,y,0)=sin(2*pi*x)cos(2*pi*y)
! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y)
! Analytical Solution (subscript denote derivatives):
! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re)
! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re)
! omega=v_x-u_y
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! nplots = number of plots produced
! plotgap = number of timesteps inbetween plots
! Re = dimensionless Renold's number
! ReInv = 1/Re for optimization
! dt = timestep size
! dtInv = 1/dt for optimization
! tol = determines when convergences is reached
! scalemodes = 1/(Nx*Ny), scaling after preforming FFTs
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! time = times at which data is saved
! chg = error at each iteration
! .. Arrays ..
! omeg = vorticity in real space
! omeghat = 2D Fourier transform of vorticity
! at next iterate
! omegoldhat = 2D Fourier transform of vorticity at previous
! iterate
! nl = nonlinear term
! nlhat = nonlinear term in Fourier space
! nloldhat = nonlinear term in Fourier space
! at previous iterate
! omegexact = taylor-green vorticity at
! at final step
! psihat = 2D Fourier transform of streamfunction
! at next iteration
! temp1/temp2/temp3= reusable complex/real space used for
! calculations. This reduces number of
! arrays stored.
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! name_config = array to store filename for data to be saved
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External libraries required
! Cuda FFT
! OpenACC
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)
! OpenMP
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision
module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
interface cufftExecD2Z
subroutine cufftExecD2Z(plan, idata, odata) &
& bind(C,name='cufftExecD2Z')
use iso_c_binding
use precision
integer(c_int), value :: plan
real(fp_kind), device :: idata(1:nx,1:ny)
complex(fp_kind),device :: odata(1:nx/2+1,1:ny)
end subroutine cufftExecD2Z
end interface cufftExecD2Z
interface cufftExecZ2D
subroutine cufftExecZ2D(plan, idata, odata) &
& bind(C,name='cufftExecZ2D')
use iso_c_binding
use precision
integer(c_int),value:: plan
complex(fp_kind),device:: idata(1:nx/2+1,1:ny)
real(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecZ2D
end interface cufftExecZ2D
end module cufft
PROGRAM main
USE precision
USE cufft
USE openacc
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=512
INTEGER(kind=4), PARAMETER :: Ny=512
REAL(kind=8), PARAMETER :: dt=0.000125d0
REAL(kind=8), PARAMETER :: dtInv=1.0d0/REAL(dt,kind(0d0))
REAL(kind=8), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(kind=8), PARAMETER :: Re=1.0d0
REAL(kind=8), PARAMETER :: ReInv=1.0d0/REAL(Re,kind(0d0))
REAL(kind=8), PARAMETER :: tol=0.1d0**10
REAL(kind=8) :: scalemodes
REAL(kind=8) :: chg
INTEGER(kind=4), PARAMETER :: nplots=1, plotgap=20
COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky
REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y,time
REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: omeg,nl, temp2, temp3, omegexact
COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: omegoldhat, nloldhat,&
omeghat,nlhat, psihat,temp1
INTEGER(kind=4) :: i,j,n,t, allocatestatus
INTEGER(kind=4) :: pland2z,planz2d
INTEGER(kind=4) :: count, iol
CHARACTER*100 :: name_config
INTEGER(kind=4) :: start, finish, count_rate
INTEGER(kind=4) :: ierr,vecsize,gangsize
INTEGER(kind=8) :: planfxy,planbxy
vecsize=32
gangsize=16
PRINT *,'Grid:',Nx,'X',Ny
PRINT *,'dt:',dt
ALLOCATE(time(1:nplots+1),kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),&
omeg(1:Nx,1:Ny),omegoldhat(1:Nx/2+1,1:Ny),&
nloldhat(1:Nx/2+1,1:Ny),temp3(1:Nx,1:Ny),omeghat(1:Nx/2+1,1:Ny),&
nl(1:Nx,1:Ny),nlhat(1:Nx/2+1,1:Ny), psihat(1:Nx/2+1,1:Ny),&
temp1(1:Nx/2+1,1:Ny),omegexact(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'allocated space'
CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z)
CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D)
PRINT *,'Setup 2D FFTs'
! setup fourier frequencies in x-direction
!$acc data copy(kx,ky,x,y,time,temp3,omeg,nl,temp1,temp2,omegoldhat,nloldhat,omeghat,nlhat,psihat)
PRINT *, 'Copied arrays over to device'
!$acc kernels loop
DO i=1,Nx/2+1
kx(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))
END DO
!$acc end kernels
kx(1+Nx/2)=0.0d0
!$acc kernels loop
DO i = 1,Nx/2 -1
kx(i+1+Nx/2)=-kx(1-i+Nx/2)
END DO
!$acc end kernels
!$acc kernels loop
DO i=1,Nx
x(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0))
END DO
!$acc end kernels
! setup fourier frequencies in y-direction
!$acc kernels loop
DO j=1,Ny/2+1
ky(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))
END DO
!$acc end kernels
ky(1+Ny/2)=0.0d0
!$acc kernels loop
DO j = 1,Ny/2 -1
ky(j+1+Ny/2)=-ky(1-j+Ny/2)
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,Ny
y(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0))
END DO
!$acc end kernels
scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0))
PRINT *,'Setup grid and fourier frequencies'
!initial data
!$acc kernels loop
DO j=1,Ny
DO i=1,NX
omeg(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*sin(2.0d0*pi*y(j))!+0.01d0*cos(2.0d0*pi*y(j))
END DO
END DO
!$acc end kernels
!\hat{\omega^{n,k}}
CALL cufftExecD2Z(pland2z,omeg,omeghat)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!get initial nonlinear term using omeghat, psihat, u, and v!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!\hat{\psi^{n+1,k+1}}
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14)
END DO
END DO
!$acc end kernels
!\omega^{n+1,k+1}
CALL cufftExecZ2D(planz2d,omeghat,omeg)
!get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=psihat(i,j)*ky(j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3) !u
! \hat{\omega_x^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*kx(i)
END DO
END DO
!$acc end kernels
! \omega_x^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)
! first part nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=temp3(i,j)*temp2(i,j)
END DO
END DO
!$acc end kernels
!get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3) !v
! \hat{\omega_y^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*ky(j)
END DO
END DO
!$acc end kernels
! \omega_y^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)
! get the rest of nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes
END DO
END DO
!$acc end kernels
! transform nonlinear term into fourier space
CALL cufftExecD2Z(pland2z,nl,nlhat)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=omeg(i,j)
END DO
END DO
!$acc end kernels
PRINT *,'Got initial data, starting timestepping'
time(1)=0.0d0
CALL system_clock(start,count_rate)
DO t=1,nplots
DO n=1,plotgap
chg=1.0d0
! save old values(__^{n,k} terms in equation)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
nloldhat(i,j)=nlhat(i,j)
END DO
END DO
!$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
omegoldhat(i,j)=omeghat(i,j)
END DO
END DO
!$acc end kernels
DO WHILE (chg>tol)
!Crank-Nicolson timestepping to get \hat{\omega^{n+1,k+1}}
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
omeghat(i,j)=( (dtInv+0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j)))&
*omegoldhat(i,j) - 0.5d0*(nloldhat(i,j)+nlhat(i,j)))/&
(dtInv-0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j)))
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,omeghat,omeg)
! check for convergence
chg=0.0d0
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
chg=chg+(omeg(i,j)-temp2(i,j))*(omeg(i,j)-temp2(i,j))&
*scalemodes*scalemodes
END DO
END DO
!$acc end kernels
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!get nonlinear term using omeghat, psihat, u, and v!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!\hat{\psi^{n+1,k+1}}
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14)
END DO
END DO
!$acc end kernels
!\omega^{n+1,k+1}
CALL cufftExecZ2D(planz2d,omeghat,omeg)
!get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=psihat(i,j)*ky(j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3) !u
! \hat{\omega_x^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*kx(i)
END DO
END DO
!$acc end kernels
! \omega_x^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)
! first part nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=temp3(i,j)*temp2(i,j)
END DO
END DO
!$acc end kernels
!get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2D(planz2d,temp1,temp3)
! \hat{\omega_y^{n,k}}
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
temp1(i,j)=omeghat(i,j)*ky(j)
END DO
END DO
!$acc end kernels
! \omega_y^{n,k}
CALL cufftExecZ2D(planz2d,temp1,temp2)
! get the rest of nonlinear term in real space
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes
END DO
END DO
!$acc end kernels
! transform nonlinear term into fourier space
CALL cufftExecD2Z(pland2z,nl,nlhat)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!\omega^{n+1,k+1} is saved for next iteration
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=omeg(i,j)
END DO
END DO
!$acc end kernels
END DO
END DO
time(t+1)=time(t)+dt*plotgap
!PRINT *, time(t+1)
END DO
CALL system_clock(finish,count_rate)
PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),&
'for Time stepping'
!get exact omega
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*&
sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt)
END DO
END DO
!$acc end kernels
!$acc end data
!compute max error
PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact))
!!!!!!!!!!!!!!!!!!!!!!!!
!copy over data to disk!
!!!!!!!!!!!!!!!!!!!!!!!!
write(name_config,'(a,i0,a)') 'omega',1,'.datbin'
INQUIRE(iolength=iol) omeg(1,1)
OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol)
count = 1
DO j=1,Ny
DO i=1,Nx
WRITE(11,rec=count) omeg(i,j)*scalemodes
count=count+1
END DO
END DO
CLOSE(11)
name_config = 'time.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nplots+1
WRITE(11,*) time(i)
END DO
CLOSE(11)
name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)
name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
!!!!!!!!!!!!!!!!!!!!!!!!
CALL cufftDestroy(pland2z)
CALL cufftDestroy(planz2d)
DEALLOCATE(time,temp1,temp2,temp3,kx,ky,x,y,&
omeg,omegoldhat,omegexact, nloldhat,&
omeghat,nl, nlhat, psihat,&
stat=AllocateStatus)
IF (AllocateStatus .ne. 0) STOP
PRINT *,'Program execution complete'
END PROGRAM main
2D Cubic Nonlinear Schrödinger Equations
[edit | edit source]These programs use splitting.
|
(
) |
!--------------------------------------------------------------------
!
! PURPOSE
!
! This program solves nonlinear Schrodinger equation in 2 dimensions
! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0
! using a second order time spectral splitting scheme
!
! The boundary conditions are u(x=0,y)=u(x=2*L*\pi,y)
! and u(x,y=0)=u(x,y=2*L*\pi)
! The initial condition is u=exp(-x^2-y^2)
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! Tmax = maximum simulation time
! plotgap = number of timesteps between plots
! pi = 3.14159265358979323846264338327950288419716939937510d0
! L = width of box
! ES = +1 for focusing and -1 for defocusing
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! plan = fft plan
! dt = timestep
! InMass = initial mass
! FiMass = final mass
! InEner = initial energy
! FiEner = final energy
! .. Arrays ..
! u = approximate solution
! v = Fourier transform of approximate solution
! u_d = approximate solution on device
! v_d = Fourier transform of approximate solution on device
! temp1_d = temporary array used to find mass and energy
! temp2_d = temporary array used to find mass and energy
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/calling-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
!
! External libraries required
! cufft -- Cuda FFT library
!
!
! Define the INTERFACE to the NVIDIA CUFFT routines
!
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision
module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecZ2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleComplex *odata,
! int direction;
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2Z
subroutine cufftExecZ2Z(plan, idata, odata, direction) &
& bind(C,name='cufftExecZ2Z')
use iso_c_binding
use precision
integer(c_int),value:: direction
integer(c_int),value:: plan
complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata
end subroutine cufftExecZ2Z
end interface cufftExecZ2Z
end module cufft
PROGRAM main
use precision
use cufft
! Declare host variables and scalars
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
INTEGER(kind=4), PARAMETER :: Ny=1024
INTEGER(kind=4), PARAMETER :: Nt=20
INTEGER(kind=4), PARAMETER :: plotgap=5
REAL(fp_kind), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Lx=5.0d0
REAL(fp_kind), PARAMETER :: Ly=5.0d0
REAL(fp_kind), PARAMETER :: Es=1.0d0
REAL(fp_kind) :: dt=0.10d0**5
REAL(fp_kind) :: scalemodes
COMPLEX(fp_kind) :: InMass,FiMass,InEner,FiEner
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: x,y
COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE :: u
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: time
INTEGER(kind=4) :: i,j,k,n,modes,AllocateStatus,plan, kersize
INTEGER(kind=4) :: start, finish, count_rate
CHARACTER*100 :: name_config
! Declare variables for GPU
COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
REAL(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: u_d,v_d,temp1_d,temp2_d
REAL(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: time_d
! start timing
PRINT *,'Program starting'
PRINT *,'Grid: ',Nx,'X',Ny
! allocate arrays on the host
ALLOCATE(x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),time(1:Nt+1),&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated CPU arrays'
! allocate arrays on the device
ALLOCATE(kx_d(1:Nx),ky_d(1:Nx),x_d(1:Nx),y_d(1:Nx),&
u_d(1:Nx,1:Ny),v_d(1:Nx,1:Ny),time_d(1:Nt+1),&
temp1_d(1:Nx,1:Ny),temp2_d(1:Nx,1:Ny),stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated GPU arrays'
kersize=min(Nx,256)
! set up ffts
CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z)
PRINT *,'Setup FFTs'
! setup fourier frequencies
!$cuf kernel do <<< *, * >>>
DO i=1,1+Nx/2
kx_d(i)= cmplx(0.0d0,1.0d0)*(i-1.0d0)/Lx
END DO
kx_d(1+Nx/2)=0.0d0
!$cuf kernel do <<< *, * >>>
DO i = 1,Nx/2 -1
kx_d(i+1+Nx/2)=-kx_d(1-i+Nx/2)
END DO
!$cuf kernel do <<< *, * >>>
DO i=1,Nx
x_d(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind))*pi*Lx
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,1+Ny/2
ky_d(j)= cmplx(0.0d0,1.0d0)*(j-1.0d0)/Ly
END DO
ky_d(1+Ny/2)=0.0d0
!$cuf kernel do <<< *, * >>>
DO j = 1,Ny/2 -1
ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2)
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,Ny
y_d(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind))*pi*Ly
END DO
scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind)
PRINT *,'Setup grid and fourier frequencies'
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(-1.0d0*(x_d(i)**2+y_d(j)**2))
END DO
END DO
! transform initial data
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
PRINT *,'Got initial data'
! get initial mass
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
InMass=temp2_d(1,1)
! Get initial energy
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
InEner=temp2_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
InEner=InEner+temp1_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
InEner=InEner+temp1_d(1,1)
! start timing
CALL system_clock(start,count_rate)
! Do first half time step
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*( kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j) )&
*cmplx(0.0d0,0.50d0) )*v_d(i,j)
END DO
END DO
PRINT *,'Starting timestepping'
time(1)=0.0d0
DO n=1,Nt
time_d(n+1)=n*dt
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2
END DO
END DO
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))&
*u_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*cmplx(0.0d0,1.0d0))*v_d(i,j)
END DO
END DO
END DO
! transform back final data and do another half time step
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2
END DO
END DO
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))&
*u_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*cmplx(0.0d0,0.50d0))*v_d(i,j)
END DO
END DO
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
! normalize
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=u_d(i,j)*scalemodes
END DO
END DO
CALL system_clock(finish,count_rate)
PRINT*,'Program took ',&
REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),'s for execution'
PRINT *,'Finished time stepping'
! calculate final mass
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
FiMass=temp2_d(1,1)
PRINT*,'Initial mass',InMass
PRINT*,'Final mass',FiMass
PRINT*,'Final Mass/Initial Mass', &
ABS(REAL(FiMass,kind=fp_kind)/REAL(InMass,kind=fp_kind))
! Get final energy
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
FiEner=temp2_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
FiEner=FiEner+temp1_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
FiEner=FiEner+temp1_d(1,1)
PRINT*,'Initial energy',InEner
PRINT*,'Final energy',FiEner
PRINT*,'Final Energy/Initial Energy', &
ABS(REAL(FiEner,kind=fp_kind)/REAL(InEner,kind=fp_kind))
! Copy results back to host
u=u_d
time=time_d
x=x_d
y=y_d
name_config = 'ufinal.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
DO i=1,Nx
WRITE(11,*) abs(u(i,j))**2
END DO
END DO
CLOSE(11)
name_config = 'tdata.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,1+Nt/plotgap
WRITE(11,*) time(j)
END DO
CLOSE(11)
name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)
name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
PRINT *,'Saved data'
! Destroy the plan
CALL cufftDestroy(plan)
DEALLOCATE(kx_d,ky_d,x_d,y_d,&
u_d,v_d,time_d,&
temp1_d,temp2_d,&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
DEALLOCATE(x,y,u,time,&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'deallocated memory'
PRINT *,'Program execution complete'
END PROGRAM main
|
(
) |
!--------------------------------------------------------------------
!
!
! PURPOSE
!
! This program solves nonlinear Schrodinger equation in 2 dimensions
! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0
! using a second order time spectral splitting scheme
!
! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y),
! u(x,y=0)=u(x,y=2*Ly*\pi)
! The initial condition is u=exp(-x^2-y^2)
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! Tmax = maximum simulation time
! plotgap = number of timesteps between plots
! FFTW_IN_PLACE = value for FFTW input
! FFTW_MEASURE = value for FFTW input
! FFTW_EXHAUSTIVE = value for FFTW input
! FFTW_PATIENT = value for FFTW input
! FFTW_ESTIMATE = value for FFTW input
! FFTW_FORWARD = value for FFTW input
! FFTW_BACKWARD = value for FFTW input
! pi = 3.14159265358979323846264338327950288419716939937510d0
! Lx = width of box in x direction
! Ly = width of box in y direction
! ES = +1 for focusing and -1 for defocusing
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! numthreads = number of openmp threads
! ierr = error return code
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! planfxy = Forward 2d fft plan
! planbxy = Backward 2d fft plan
! dt = timestep
! InMass = initial mass
! FiMass = final mass
! InEner = initial energy
! FiEner = final energy
! .. Arrays ..
! u = approximate solution
! v = Fourier transform of approximate solution
! temp1 = temporary field
! temp2 = temporary field
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACKNOWLEDGEMENTS
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
! precision
! cufft
!
! External libraries required
! CuFFT -- Cuda FFT Library
! OpenACC
!
! Define the INTERFACE to the NVIDIA CUFFT routines
!
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision
module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecZ2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleComplex *odata,
! int direction;
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2Z
subroutine cufftExecZ2Z(plan, idata, odata, direction) &
& bind(C,name='cufftExecZ2Z')
use iso_c_binding
use precision
integer(c_int),value:: direction
integer(c_int),value:: plan
complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata
end subroutine cufftExecZ2Z
end interface cufftExecZ2Z
end module cufft
PROGRAM main
USE precision
USE cufft
USE openacc
! Declare variables
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=128
INTEGER(kind=4), PARAMETER :: Ny=128
INTEGER(kind=4), PARAMETER :: Nt=20
INTEGER(kind=4), PARAMETER :: plotgap=20
REAL(fp_kind), PARAMETER :: &
pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Lx=5.0d0
REAL(fp_kind), PARAMETER :: Ly=5.0d0
REAL(fp_kind), PARAMETER :: Es=1.0d0
REAL(fp_kind) :: dt=0.10d0**5
REAL(fp_kind) :: scalemodes
COMPLEX(fp_kind) :: InMass,FiMass,InEner,FiEner
COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE :: kx
COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE :: ky
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: x
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: y
COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE :: u,v,temp1,temp2
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: time
INTEGER(kind=4) :: i,j,k,n,allocatestatus,ierr, vecsize,gangsize
REAL(fp_kind) :: start_time,stop_time
INTEGER(kind=4) :: plan
CHARACTER*100 :: name_config
vecsize=32
gangsize=16
PRINT *,'Program starting'
PRINT *,'Grid: ',Nx,'X',Ny
ALLOCATE(kx(1:Nx),ky(1:Nx),x(1:Nx),y(1:Nx),u(1:Nx,1:Ny),&
v(1:Nx,1:Ny),temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
time(1:1+Nt/plotgap),stat=allocatestatus)
IF (allocatestatus .ne. 0) stop
PRINT *,'allocated memory'
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
! set up ffts
CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z)
PRINT *,'Setup FFTs'
! setup fourier frequencies
!$acc kernels loop
DO i=1,1+Nx/2
kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx
END DO
!$acc end kernels
kx(1+Nx/2)=0.0d0
!$acc kernels loop
DO i = 1,Nx/2 -1
kx(i+1+Nx/2)=-kx(1-i+Nx/2)
END DO
!$acc end kernels
!$acc kernels loop
DO i=1,Nx
x(i)=(-1.0d0+2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)) )*pi*Lx
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,1+Ny/2
ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly
END DO
!$acc end kernels
ky(1+Ny/2)=0.0d0
!$acc kernels loop
DO j = 1,Ny/2 -1
ky(j+1+Ny/2)=-ky(1-j+Ny/2)
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,Ny
y(j)=(-1.0d0+2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)) )*pi*Ly
END DO
!$acc end kernels
scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0))
PRINT *,'Setup grid and fourier frequencies'
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
u(i,j)=exp(-1.0d0*(x(i)**2 +y(j)**2))
END DO
END DO
!$acc end kernels
! transform initial data
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)
PRINT *,'Got initial data'
! get initial mass
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=abs(u(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
InMass=temp2(1,1)
! Get initial energy
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
InEner=temp2(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=kx(i)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
InEner=InEner+temp1(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=ky(j)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
InEner=InEner+temp1(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
CALL cpu_time(start_time)
! transform initial data and do first half time step
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))&
*cmplx(0.0d0,1.0d0))*v(i,j)
END DO
END DO
!$acc end kernels
PRINT *,'Got initial data, starting timestepping'
time(1)=0.0d0
DO n=1,Nt
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2
END DO
END DO
!$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
u(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v(i,j))&
*u(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=exp(dt*(kx(i)*kx(i) + ky(j)*ky(j))&
*cmplx(0.0d0,1.0d0))*v(i,j)
END DO
END DO
!$acc end kernels
IF (mod(n,plotgap)==0) then
time(1+n/plotgap)=n*dt
PRINT *,'time',n*dt
END IF
END DO
! transform back final data and do another half time step
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2
END DO
END DO
!$acc end kernels
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
u(i,j)=exp(cmplx(0,-1)*dt*v(i,j))*u(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))&
*cmplx(0.0d0,1.0d0))*v(i,j)
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE)
!$acc kernels loop gang(gangsize), vector(vecsize)
DO j=1,Ny
DO i=1,Nx
u(i,j)=u(i,j)*scalemodes
END DO
END DO
!$acc end kernels
PRINT *,'Finished time stepping'
CALL cpu_time(stop_time)
!$acc end data
PRINT*,'Program took ',stop_time-start_time,&
'for Time stepping'
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
! calculate final mass
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=abs(u(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
FiMass=temp2(1,1)
! Get final energy
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD)
!$acc end data
FiEner=temp2(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=kx(i)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
FiEner=FiEner+temp1(1,1)
!$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=ky(j)*v(i,j)*scalemodes
END DO
END DO
!$acc end kernels
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
temp2(i,j)=0.5d0*abs(temp1(i,j))**2
END DO
END DO
!$acc end kernels
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD)
!$acc end data
FiEner=FiEner+temp1(1,1)
PRINT *,'Results copied back to host'
PRINT*,'Initial mass',InMass
PRINT*,'Final mass',FiMass
PRINT*,'Final Mass/Initial Mass', &
ABS(REAL(FiMass,kind(0d0))/REAL(InMass,kind(0d0)))
PRINT*,'Initial energy',InEner
PRINT*,'Final energy',FiEner
PRINT*,'Final Energy/Initial Energy', &
ABS(REAL(FiEner,kind(0d0))/REAL(InEner,kind(0d0)))
name_config = 'ufinal.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
DO i=1,Nx
WRITE(11,*) abs(u(i,j))**2
END DO
END DO
CLOSE(11)
name_config = 'tdata.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,1+Nt/plotgap
WRITE(11,*) time(j)
END DO
CLOSE(11)
name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)
name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
PRINT *,'Saved data'
CALL cufftDestroy(plan)
DEALLOCATE(u,v,temp1,temp2,time,kx,ky,x,y,stat=allocatestatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated memory'
PRINT *,'Program execution complete'
END PROGRAM main
2D sine-Gordon Equations
[edit | edit source]These programs use a semi-explicit method that is similar to that used for the Klein-Gordon equation. Only the main program is included here, and the auxiliary subroutines can be downloaded from Cloutier, Muite and Rigge[4]
|
(
) |
!--------------------------------------------------------------------
!
!
! PURPOSE
!
! This program solves nonlinear sine-Gordon equation in 2 dimensions
! u_{tt}-u_{xx}-u_{yy}=-sin(u)
! using a second order implicit-explicit time stepping scheme.
!
! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y),
! u(x,y=0)=u(x,y=2*Ly*\pi)
! The initial condition is set in initialdata.f90
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! plotgap = number of timesteps between plots
! FFTW_IN_PLACE = value for FFTW input
! FFTW_MEASURE = value for FFTW input
! FFTW_EXHAUSTIVE = value for FFTW input
! FFTW_PATIENT = value for FFTW input
! FFTW_ESTIMATE = value for FFTW input
! FFTW_FORWARD = value for FFTW input
! FFTW_BACKWARD = value for FFTW input
! pi = 3.1415926535...
! Lx = width of box in x direction
! Ly = width of box in y direction
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! planfxy = Forward 2d fft plan (FFTW)
! planbxy = Backward 2d fft plan (FFTW)
! planf = Forward 2d fft plan (CUFFT)
! planb = Backward 2d fft plan (CUFFT)
! dt = timestep
! ierr = error code
! plotnum = number of plot
! .. Arrays ..
! u = approximate solution
! uold = approximate solution
! u_d = approximate solution (on GPU)
! v_d = Fourier transform of approximate solution (on GPU)
! uold_d = approximate solution (on GPU)
! vold_d = Fourier transform of approximate solution (on GPU)
! nonlinhat_d = Fourier transform of nonlinear term, sin(u) (on GPU)
! temp1 = extra space for energy computation
! temp2 = extra space for energy computation
! savearray = temp array to save out to disk
! .. Vectors ..
! kx = Fourier frequencies in x direction
! ky = Fourier frequencies in y direction
! kx_d = Fourier frequencies in x direction (on GPU)
! ky_d = Fourier frequencies in y direction (on GPU)
! x = x locations
! y = y locations
! time = times at which save data
! en = total energy
! enstr = strain energy
! enpot = potential energy
! enkin = kinetic energy
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
! getgrid.f90 -- Get initial grid of points
! initialdata.f90 -- Get initial data
! enercalc.f90 -- Subroutine to calculate the energy
! savedata.f90 -- Save initial data
! External libraries required
! Cuda FFT -- http://developer.nvidia.com/cufft
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
!
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision
module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecD2Z(cufftHandle plan,
! cufftDoubleReal *idata,
! cufftDoubleComplex *odata)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecD2Z
subroutine cufftExecD2Z(plan, idata, odata) &
& bind(C,name='cufftExecD2Z')
use iso_c_binding
use precision
integer(c_int), value :: plan
real(fp_kind), device :: idata(1:nx,1:ny)
complex(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecD2Z
end interface cufftExecD2Z
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecD2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleReal *odata)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2D
subroutine cufftExecZ2D(plan, idata, odata) &
& bind(C,name='cufftExecZ2D')
use iso_c_binding
use precision
integer(c_int),value :: plan
complex(fp_kind),device:: idata(1:nx,1:ny)
real(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecZ2D
end interface cufftExecZ2D
end module cufft
PROGRAM sg2d
USE precision
USE cudafor
USE cufft
! Declare variables
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
INTEGER(kind=4), PARAMETER :: Ny=Nx
INTEGER(kind=4), PARAMETER :: Nt=500
INTEGER(kind=4), PARAMETER :: plotgap=Nt+1
REAL(kind=8), PARAMETER :: &
pi=3.14159265358979323846264338327950288419716939937510d0
REAL(kind=8), PARAMETER :: Lx=5.0d0
REAL(kind=8), PARAMETER :: Ly=5.0d0
REAL(kind=8) :: dt=0.001d0
COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky
REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y
REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold
COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2
REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray
REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en
REAL(kind=8) :: scalemodes
INTEGER(kind=4) :: ierr,i,j,n,allocatestatus
INTEGER(kind=4) :: start, finish, count_rate, plotnum
INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, &
FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64
INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1
INTEGER(kind=8) :: planfxy,planbxy
CHARACTER*100 :: name_config
INTEGER(kind=4) :: planf,planb,kersize
! GPU variables
COMPLEX(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
COMPLEX(fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: vold_d,v_d,nonlinhat_d
REAL (fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: uold_d,u_d
! print run information
PRINT *,"Nx=", Nx
PRINT *,"Ny=", Ny
PRINT *,"Nt=", Nt
PRINT *,"Lx=", Lx
PRINT *,"Ly=", Ly
PRINT *,"dt=", dt
kersize=min(Nx,256)
ALLOCATE(kx(1:Nx),ky(1:Ny),kx_d(1:Nx),ky_d(1:Ny),x(1:Nx),y(1:Ny),&
u(1:Nx,1:Ny),uold(1:Nx,1:Ny),u_d(1:Nx,1:Ny),uold_d(1:Nx,1:Ny),&
v_d(1:Nx/2+1,1:Ny),vold_d(1:Nx/2+1,1:Ny),&
savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),&
enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),&
nonlinhat_d(1:Nx/2+1,1:Ny),&
temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
stat=allocatestatus)
IF (allocatestatus .ne. 0) stop
PRINT *,'allocated arrays'
scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0))
! set up cuda ffts
call cufftPlan2D(planf,nx,ny,CUFFT_D2Z)
call cufftPlan2D(planb,nx,ny,CUFFT_Z2D)
! set up fftw ffts
CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE)
CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE)
PRINT *,'Setup FFTs'
! setup grid, wave numbers
CALL getgrid(Nx,Ny,Lx,Ly,pi,name_config,x,y,kx,ky)
kx_d=kx
ky_d=ky
PRINT *,'Got grid and fourier frequencies'
CALL initialdata(Nx,Ny,x,y,u,uold)
u_d=u
uold_d=uold
plotnum=1
name_config = 'data/u'
savearray=REAL(u)
! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking
PRINT *,'data saved'
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold)
call cufftExecD2Z(planf,u_d,v_d)
call cufftExecD2Z(planf,uold_d,vold_d)
PRINT *,'Got initial data, starting timestepping'
time(plotnum)=0.0d0
CALL system_clock(start,count_rate)
DO n=1,Nt
!$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
uold_d(i,j)=u_d(i,j)
END DO
END DO
!$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=sin(u_d(i,j))
END DO
END DO
call cufftExecD2Z(planf,u_d,nonlinhat_d)
!$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
nonlinhat_d(i,j)=scalemodes*( 0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*(2.0d0*v_d(i,j)+vold_d(i,j))&
+(2.0d0*v_d(i,j)-vold_d(i,j))/(dt*dt)&
-nonlinhat_d(i,j) )&
/(1/(dt*dt)-0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j)))
END DO
END DO
!$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
vold_d(i,j)=v_d(i,j)
END DO
END DO
!$cuf kernel do(2) <<< (1,*), (kersize,1) >>>
DO j=1,Ny
DO i=1,Nx/2+1
v_d(i,j)=nonlinhat_d(i,j)/scalemodes
END DO
END DO
call cufftExecZ2D(planb,nonlinhat_d,u_d)
IF (mod(n,plotgap)==0) then
plotnum=plotnum+1
time(plotnum)=n*dt
PRINT *,'time',n*dt
u=u_d
uold=uold_d
! savearray=REAL(u,kind(0d0)) ! disabled for benchmarking
! CALL savedata(Nx,Ny,plotnum,name_config,savearray)
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold)
END IF
END DO
CALL system_clock(finish,count_rate)
PRINT *,'Finished time stepping'
u=u_d
uold=uold_d
! compute energy at the end
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),&
enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u,uold)
PRINT*,'Program took ',&
REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),&
'for Time stepping'
CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),&
enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1))
! Save times at which output was made in text format
PRINT *,'Saved data'
call cufftDestroy(planf)
call cufftDestroy(planb)
PRINT *,'Destroy CUFFT Plans'
call dfftw_destroy_plan_(planbxy)
call dfftw_destroy_plan_(planfxy)
PRINT *,'Destroy FFTW Plans'
DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,&
stat=allocatestatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated host arrays'
DEALLOCATE(uold_d,vold_d,u_d,v_d,nonlinhat_d,&
kx_d,ky_d,&
stat=allocatestatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated gpu arrays'
PRINT *,'Program execution complete'
END PROGRAM sg2d
|
(
) |
!--------------------------------------------------------------------
!
!
! PURPOSE
!
! This program solves nonlinear sine-Gordon equation in 2 dimensions
! u_{tt}-u_{xx}-u_{yy}=-sin(u)
! using a second order implicit-explicit time stepping scheme.
!
! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y),
! u(x,y=0)=u(x,y=2*Ly*\pi)
! The initial condition is set in initialdata.f90
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! plotgap = number of timesteps between plots
! FFTW_IN_PLACE = value for FFTW input
! FFTW_MEASURE = value for FFTW input
! FFTW_EXHAUSTIVE = value for FFTW input
! FFTW_PATIENT = value for FFTW input
! FFTW_ESTIMATE = value for FFTW input
! FFTW_FORWARD = value for FFTW input
! FFTW_BACKWARD = value for FFTW input
! pi = 3.1415926535...
! Lx = width of box in x direction
! Ly = width of box in y direction
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! planfxy = Forward 2d fft plan (FFTW)
! planbxy = Backward 2d fft plan (FFTW)
! planf = Forward 2d fft plan (CUFFT)
! planb = Backward 2d fft plan (CUFFT)
! dt = timestep
! ierr = error code
! plotnum = number of plot
! .. Arrays ..
! u = approximate solution
! uold = approximate solution
! v = Fourier transform of approximate solution
! vold = Fourier transform of approximate solution
! nonlinhat = Fourier transform of nonlinear term, sin(u)
! temp1 = extra space for energy computation
! temp2 = extra space for energy computation
! savearray = temp array to save out to disk
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! en = total energy
! enstr = strain energy
! enpot = potential energy
! enkin = kinetic energy
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
! getgrid.f90 -- Get initial grid of points
! initialdata.f90 -- Get initial data
! enercalc.f90 -- Subroutine to calculate the energy
! savedata.f90 -- Save initial data
! External libraries required
! Cuda FFT
! OpenACC
! FFTW3 -- Fastest Fourier Transform in the West
! (http://www.fftw.org/)
! OpenMP
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
!
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision
module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecD2Z(cufftHandle plan,
! cufftDoubleReal *idata,
! cufftDoubleComplex *odata)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecD2Z
subroutine cufftExecD2Z(plan, idata, odata) &
& bind(C,name='cufftExecD2Z')
use iso_c_binding
use precision
integer(c_int), value :: plan
real(fp_kind), device :: idata(1:nx,1:ny)
complex(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecD2Z
end interface cufftExecD2Z
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecD2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleReal *odata)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2D
subroutine cufftExecZ2D(plan, idata, odata) &
& bind(C,name='cufftExecZ2D')
use iso_c_binding
use precision
integer(c_int),value :: plan
complex(fp_kind),device:: idata(1:nx,1:ny)
real(fp_kind),device :: odata(1:nx,1:ny)
end subroutine cufftExecZ2D
end interface cufftExecZ2D
end module cufft
PROGRAM sg2d
USE precision
USE cufft
USE openacc
! Declare variables
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
INTEGER(kind=4), PARAMETER :: Ny=Nx
INTEGER(kind=4), PARAMETER :: Nt=500
INTEGER(kind=4), PARAMETER :: plotgap=Nt+1
REAL(kind=8), PARAMETER :: &
pi=3.14159265358979323846264338327950288419716939937510d0
REAL(kind=8), PARAMETER :: Lx=5.0d0
REAL(kind=8), PARAMETER :: Ly=5.0d0
REAL(kind=8) :: dt=0.001d0
COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky
REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y
REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold
COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2,v,vold,nonlinhat
REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray
REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en
INTEGER(kind=4) :: ierr,i,j,n,allocatestatus
INTEGER(kind=4) :: start, finish, count_rate, plotnum
INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, &
FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64
INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1
INTEGER(kind=8) :: planfxy,planbxy
CHARACTER*100 :: name_config
INTEGER(kind=4) :: planf,planb
! print run information
PRINT *,"Nx=", Nx
PRINT *,"Ny=", Ny
PRINT *,"Nt=", Nt
PRINT *,"Lx=", Lx
PRINT *,"Ly=", Ly
PRINT *,"dt=", dt
ALLOCATE(kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),uold(1:Nx,1:Ny),&
v(1:Nx/2+1,1:Ny),vold(1:Nx/2+1,1:Ny),nonlinhat(1:Nx/2+1,1:Ny),&
savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),&
enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),&
temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),&
stat=allocatestatus)
IF (allocatestatus .ne. 0) stop
PRINT *,'allocated arrays'
! set up cuda ffts
call cufftPlan2D(planf,nx,ny,CUFFT_D2Z)
call cufftPlan2D(planb,nx,ny,CUFFT_Z2D)
! set up fftw ffts
CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE)
CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE)
PRINT *,'Setup FFTs'
! setup grid, wave numbers
!$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u)
!$acc kernels loop
DO i=1,1+Nx/2
kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx
END DO
!$acc end kernels
kx(1+Nx/2)=0.0d0
!$acc kernels loop
DO i = 1,Nx/2 -1
kx(i+1+Nx/2)=-kx(1-i+Nx/2)
END DO
!$acc end kernels
!$acc kernels loop
DO i=1,Nx
x(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)))*pi*Lx
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,1+Ny/2
ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly
END DO
!$acc end kernels
ky(1+Ny/2)=0.0d0
!$acc kernels loop
DO j = 1,Ny/2 -1
ky(j+1+Ny/2)=-ky(1-j+Ny/2)
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,Ny
y(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)))*pi*Ly
END DO
!$acc end kernels
PRINT *,'Got grid and fourier frequencies'
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
u(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2))
END DO
END DO
!$acc end kernels
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
uold(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2))
END DO
END DO
!$acc end kernels
savearray=REAL(u)
plotnum=1
name_config = 'data/u'
! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking
PRINT *,'data saved'
!$acc end data
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),&
enpot(plotnum),en(plotnum),kx(1:Nx),ky(1:Ny),temp1,temp2,&
u(1:Nx,1:Ny),uold(1:Nx,1:Ny))
!$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u)
call cufftExecD2Z(planf,u,v)
call cufftExecD2Z(planf,uold,vold)
PRINT *,'Got initial data, starting timestepping'
time(plotnum)=0.0d0
CALL system_clock(start,count_rate)
DO n=1,Nt
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx
uold(i,j)=u(i,j)
u(i,j)=sin(u(i,j))
END DO
END DO
!$acc end kernels
call cufftExecD2Z(planf,u,nonlinhat)
!$acc kernels loop
DO j=1,Ny
DO i=1,Nx/2+1
nonlinhat(i,j)=( 0.25*(kx(i)*kx(i) + ky(j)*ky(j))&
*(2.0d0*v(i,j)+vold(i,j))+(2.0d0*v(i,j)-vold(i,j))/(dt*dt)&
-nonlinhat(i,j) )/(1/(dt*dt)-0.25*(kx(i)*kx(i) + ky(j)*ky(j)))
vold(i,j)=v(i,j)
v(i,j)=nonlinhat(i,j)
! prescale nonlinhat
nonlinhat(i,j)=nonlinhat(i,j)/REAL(Nx*Ny,kind(0d0))
END DO
END DO
!$acc end kernels
call cufftExecZ2D(planb,nonlinhat,u)
END DO
CALL system_clock(finish,count_rate)
!$acc end data
PRINT *,'Finished time stepping'
! compute energy at the end
! savearray=REAL(u(1:Nx,1:Ny),kind(0d0)) ! disabled for benchmarking
! CALL savedata(Nx,Ny,plotnum+1,name_config,savearray)
CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),&
enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u(1:Nx,1:Ny),uold(1:Nx,1:Ny))
PRINT*,'Program took ',&
REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),&
'for Time stepping'
CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),&
enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1))
! Save times at which output was made in text format
PRINT *,'Saved data'
call cufftDestroy(planf)
call cufftDestroy(planb)
PRINT *,'Destroy CUFFT Plans'
call dfftw_destroy_plan_(planbxy)
call dfftw_destroy_plan_(planfxy)
PRINT *,'Destroy FFTW Plans'
DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,&
stat=allocatestatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Deallocated host arrays'
PRINT *,'Program execution complete'
END PROGRAM sg2d
Notes
[edit | edit source]- ↑ Cloutier, Muite and Rigge (2012)
- ↑ Cloutier, Muite and Rigge (2012)
- ↑ Cloutier, Muite and Rigge (2012)
- ↑ Cloutier, Muite and Rigge (2012)
References
[edit | edit source]B., Cloutier; B.K., Muite; P., Rigge. "Performance of FORTRAN and C GPU Extensions for a Benchmark Suite of Fourier Pseudospectral Algorithms". Proceedings of the Symposium on Application Accelerators in High Performance computing. IEEE. http://arxiv.org/abs/1206.3215.