Simple CUDA-GPU Implementation

Naive version. The only attempt at optimisation has been to load Ex(n) etc into registers since otherwise it is read twice, and Nx, NxNy, NxNyNz, 1/dx, 1/dy and 1/dz are placed in constant memory. Since the 12 coefficient arrays C1 and C2 for Ex,Ey,Ez and Hx,Hy,Hz are aligned automatically when allocated memory by Cudamalloc, as well as the field arrays Ex,Ey,Ez and Hx,Hy,Hz, then speed of execution is actually quite acceptable. The main problem in this version of CFDTD comes with the 12 coefficients that are needed for each FDTD cell update. This means that the problem size that can fit onto the GPU is limited since we only have about 1.5GB memory to work with.

 

Shows the transverse field inside a coaxial resonator. In this CUDA-GPU implementation the memory transfers are not optimised so there are 42 R/W per FDTD cycle. Given the memory bandwidth we can estimate the maximum throughput at about 1000Mcells/s for the GTX480. We achieve about 950.

 

__constant__ float Rdx[256];

__constant__ float Rdy[256];

__constant__ float Rdz[256];

__constant__ int Nx;

__constant__ int NxNy;

__constant__ int NxNyNz;

 

E-field update

__global__ void cuYeeE( float* C1Ex,  float* C1Ey,  float* C1Ez, float* C2Ex,  float* C2Ey,  float* C2Ez,

float* Hx, float* Hy, float* Hz, float* Ex, float* Ey, float* Ez)

{

    int n = blockDim.x * blockIdx.x + threadIdx.x;

    int i,j,k;

    if ((n < NxNyNz)&&(n > NxNy)){

k = n/NxNy; //k-plane identifier

j = (n%NxNy)/Nx; //j-line on plane k identifier

i = (n%NxNy)%Nx; //i-point in line j on plane k identifier

float Hxn, Hyn, Hzn;

Hxn=Hx[n];

Hyn=Hy[n];

Hzn=Hz[n];

Ex[n] = C1Ex[n] * Ex[n] + C2Ex[n] * ((Hzn - Hz[n - Nx]) * Rdy[j] + (Hy[n - NxNy] - Hyn) * Rdz[k]);

        Ey[n] = C1Ey[n] * Ey[n] + C2Ey[n] * ((Hxn - Hx[n - NxNy]) * Rdz[k] + (Hz[n - 1] - Hzn) * Rdx[i]);

        Ez[n] = C1Ez[n] * Ez[n] + C2Ez[n] * ((Hyn - Hy[n - 1]) * Rdx[i] + (Hx[n - Nx] - Hxn) * Rdy[j]);

        __syncthreads();

        }

}

H field update

__global__ void cuYeeH( float* C1Hx,  float* C1Hy,  float* C1Hz, float* C2Hx,  float* C2Hy,

float* C2Hz, float* Hx, float* Hy, float* Hz, float* Ex, float* Ey, float* Ez)

{

    int n = blockDim.x * blockIdx.x + threadIdx.x; //    int i = blockDim.x * blockIdx.x + threadIdx.x;

    int i,j,k;

    if (n < (NxNyNz-NxNy)){

k = n/NxNy; //k-plane identifier

j = (n%NxNy)/Nx; //j-line on plane k identifier

i = (n%NxNy)%Nx; //i-point in line j on plane k identifier

float Exn, Eyn, Ezn;

Exn=Ex[n];

Eyn=Ey[n];

Ezn=Ez[n];

Hx[n]=C1Hx[n]*Hx[n]+C2Hx[n]*((Ey[n+NxNy]-Eyn)*Rdz[k]+(Ezn-Ez[n+Nx])*Rdy[j]);

Hy[n]=C1Hy[n]*Hy[n]+C2Hy[n]*((Ez[n+1]-Ezn)*Rdx[i]+(Exn-Ex[n+NxNy])*Rdz[k]);

Hz[n]=C1Hz[n]*Hz[n]+C2Hz[n]*((Ex[n+Nx]-Exn)*Rdy[j]+(Eyn-Ey[n+1])*Rdx[i]);

        __syncthreads();

}

}

 

Campus d'excel·lència internacional U A B