/*

Solution of the 2D Laplace equation for heat conduction in a square plate

CUDA version

written by: Abhijit Joshi (joshi1974@gmail.com)
*/

#include <iostream>

// global variables

const int NX = 256;      // mesh size (number of node points along X)
const int NY = 256;      // mesh size (number of node points along Y)

const int MAX_ITER=1000;  // number of Jacobi iterations

// device function to update the array T_new based on the values in array T_old
// note that all locations are updated simultaneously on the GPU 
__global__ void Laplace(float *T_old, float *T_new)
{
    // compute the "i" and "j" location of the node point
    // handled by this thread

    int i = blockIdx.x * blockDim.x + threadIdx.x ;
    int j = blockIdx.y * blockDim.y + threadIdx.y ;

    // get the natural index values of node (i,j) and its neighboring nodes
                                //                         N 
    int P = i + j*NX;           // node (i,j)              |
    int N = i + (j+1)*NX;       // node (i,j+1)            |
    int S = i + (j-1)*NX;       // node (i,j-1)     W ---- P ---- E
    int E = (i+1) + j*NX;       // node (i+1,j)            |
    int W = (i-1) + j*NX;       // node (i-1,j)            |
                                //                         S 

    // only update "interior" node points
    if(i>0 && i<NX-1 && j>0 && j<NY-1) {
        T_new[P] = 0.25*( T_old[E] + T_old[W] + T_old[N] + T_old[S] );
    }
}

// initialization

void Initialize(float *TEMPERATURE)
{
    for(int i=0;i<NX;i++) {
        for(int j=0;j<NY;j++) {
            int index = i + j*NX;
            TEMPERATURE[index]=0.0;
        }
    }

    // set left wall to 1

    for(int j=0;j<NY;j++) {
        int index = j*NX;
        TEMPERATURE[index]=1.0;
    }
}

int main(int argc,char **argv)
{
    float *T;          // pointer to host (CPU) memory
    float *_T1, *_T2;  // pointers to device (GPU) memory

    // allocate a "pre-computation" T array on the host
    float *T = new float [NX*NY];

    // initialize array on the host
    Initialize(T);

    // allocate storage space on the GPU
    cudaMalloc((void **)&_T1,NX*NY*sizeof(float));
    cudaMalloc((void **)&_T2,NX*NY*sizeof(float));

    // copy (initialized) host arrays to the GPU memory from CPU memory
    cudaMemcpy(_T1,T,NX*NY*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(_T2,T,NX*NY*sizeof(float),cudaMemcpyHostToDevice);

    // assign a 2D distribution of CUDA "threads" within each CUDA "block"    
    int ThreadsPerBlock=16;
    dim3 dimBlock( ThreadsPerBlock, ThreadsPerBlock );

    // calculate number of blocks along X and Y in a 2D CUDA "grid"
    dim3 dimGrid( ceil(float(NX)/float(dimBlock.x)), ceil(float(NY)/float(dimBlock.y)), 1 );

    // begin Jacobi iteration
    int k = 0;
    while(k<MAX_ITER) {
        Laplace<<<dimGrid, dimBlock>>>(_T1,_T2);   // update T1 using data stored in T2
        Laplace<<<dimGrid, dimBlock>>>(_T2,_T1);   // update T2 using data stored in T1
        k+=2;
    }

    // copy final array to the CPU from the GPU 
    cudaMemcpy(T,_T2,NX*NY*sizeof(float),cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();
/*
    // print the results to screen
    for (int j=NY-1;j>=0;j--) {
        for (int i=0;i<NX;i++) {
            int index = i + j*NX;
            std::cout << T[index] << " ";
        }
        std::cout << std::endl;
    }
*/
    // release memory on the host 
    delete T;

    // release memory on the device 
    cudaFree(_T1);
    cudaFree(_T2);

    return 0;
}