/*
Simulation of flow inside a 2D square cavity
using the lattice Boltzmann method (LBM)
Written by: Abhijit Joshi (joshi1974@gmail.com)
Build instructions: nvcc -arch=sm_13 gpu_lbm.cu -o lbmGPU.x
Run instructions: ./lbmGPU.x
*/
#include<iostream>
// problem parameters
const int N = 512; // number of node points along X and Y (cavity length in lattice units)
const int TIME_STEPS = 1000; // number of time steps for which the simulation is run
const int NDIR = 9; // number of discrete velocity directions used in the D2Q9 model
const double DENSITY = 2.7; // fluid density in lattice units
const double LID_VELOCITY = 0.05; // lid velocity in lattice units
const double REYNOLDS_NUMBER = 100; // Re =
__global__ void initialize(double *_ex, double *_ey, double *_alpha, int *_ant, double *_rh, double *_ux, double *_uy, double *_f, double *_feq, double *_f_new)
{
// these could be initialized directly inside the GPU kernel and moved out of the CPU part...
_ex[0] = 0.0; _ey[0] = 0.0; _alpha[0] = 4.0 / 9.0;
_ex[1] = 1.0; _ey[1] = 0.0; _alpha[1] = 1.0 / 9.0;
_ex[2] = 0.0; _ey[2] = 1.0; _alpha[2] = 1.0 / 9.0;
_ex[3] = -1.0; _ey[3] = 0.0; _alpha[3] = 1.0 / 9.0;
_ex[4] = 0.0; _ey[4] = -1.0; _alpha[4] = 1.0 / 9.0;
_ex[5] = 1.0; _ey[5] = 1.0; _alpha[5] = 1.0 / 36.0;
_ex[6] = -1.0; _ey[6] = 1.0; _alpha[6] = 1.0 / 36.0;
_ex[7] = -1.0; _ey[7] = -1.0; _alpha[7] = 1.0 / 36.0;
_ex[8] = 1.0; _ey[8] = -1.0; _alpha[8] = 1.0 / 36.0;
// these could be initialized directly inside the GPU kernel and moved out of the CPU part...
_ant[0] = 0; // 6 2 5
_ant[1] = 3; // ^
_ant[2] = 4; // |
_ant[3] = 1; // |
_ant[4] = 2; // 3 <----- 0 -----> 1
_ant[5] = 7; // |
_ant[6] = 8; // |
_ant[7] = 5; // v
_ant[8] = 6; // 7 4 8
// compute the "i" and "j" location and the "dir"
// handled by this thread
int i = blockIdx.x * blockDim.x + threadIdx.x ;
int j = blockIdx.y * blockDim.y + threadIdx.y ;
// initialize density and velocity fields inside the cavity
_rh[i+N*j] = DENSITY;
_ux[i+N*j] = 0;
_uy[i+N*j] = 0;
if(j==N-1) _ux[i+N*(N-1)] = LID_VELOCITY;
// assign initial values for distribution functions
int ixy = i+N*j;
for(int dir=0;dir<NDIR;dir++) {
int index = i+j*N+dir*N*N;
double edotu = _ex[dir]*_ux[ixy] + _ey[dir]*_uy[ixy];
double udotu = _ux[ixy]*_ux[ixy] + _uy[ixy]*_uy[ixy];
_feq[index] = _rh[ixy] * _alpha[dir] * (1 + 3*edotu + 4.5*edotu*edotu - 1.5*udotu);
_f[index] = _feq[index];
_f_new[index] = _feq[index];
}
}
__global__ void timeIntegration(double *_ex, double *_ey, double *_alpha, int *_ant, double *_rh, double *_ux, double *_uy, double *_f, double *_feq, double *_f_new)
{
// calculate fluid viscosity based on the Reynolds number
double kinematicViscosity = LID_VELOCITY * (double)N / REYNOLDS_NUMBER;
// calculate relaxation time tau
double tau = 0.5 + 3.0 * kinematicViscosity;
// compute the "i" and "j" location and the "dir"
// handled by this thread
int i = blockIdx.x * blockDim.x + threadIdx.x ;
int j = blockIdx.y * blockDim.y + threadIdx.y ;
{
// collision
if((i>0) && (i<N-1) && (j>0) && (j<N-1)) {
int ixy = i+N*j;
for(int dir=0;dir<NDIR;dir++) {
int index = i+j*N+dir*N*N;
double edotu = _ex[dir]*_ux[ixy] + _ey[dir]*_uy[ixy];
double udotu = _ux[ixy]*_ux[ixy] + _uy[ixy]*_uy[ixy];
_feq[index] = _rh[ixy] * _alpha[dir] * (1 + 3*edotu + 4.5*edotu*edotu - 1.5*udotu);
}
}
// streaming from interior node points
if((i>0) && (i<N-1) && (j>0) && (j<N-1)) {
for(int dir=0;dir<NDIR;dir++) {
int index = i+j*N+dir*N*N; // (i,j,dir)
int index_new = (i+_ex[dir]) + (j+_ey[dir])*N + dir*N*N;
int index_ant = i + j*N + _ant[dir]*N*N;
// post-collision distribution at (i,j) along "dir"
double f_plus = _f[index] - (_f[index] - _feq[index])/tau;
if((i+_ex[dir]==0) || (i+_ex[dir]==N-1) || (j+_ey[dir]==0) || (j+_ey[dir]==N-1)) {
// bounce back
int ixy = i+_ex[dir] + N*(j+_ey[dir]);
double ubdote = _ux[ixy]*_ex[dir] + _uy[ixy]*_ey[dir];
_f_new[index_ant] = f_plus - 6.0 * DENSITY * _alpha[dir] * ubdote;
}
else {
// stream to neighbor
_f_new[index_new] = f_plus;
}
}
}
// push f_new into f
if((i>0) && (i<N-1) && (j>0) && (j<N-1)) {
for(int dir=0;dir<NDIR;dir++) {
int index = i+j*N+dir*N*N; // (i,j,dir)
_f[index] = _f_new[index];
}
}
// update density at interior nodes
if((i>0) && (i<N-1) && (j>0) && (j<N-1)) {
double rho=0;
for(int dir=0;dir<NDIR;dir++) {
int index = i+j*N+dir*N*N;
rho+=_f_new[index];
}
_rh[i+N*j] = rho;
}
// update velocity at interior nodes
if((i>0) && (i<N-1) && (j>0) && (j<N-1)) {
double velx=0;
double vely=0;
for(int dir=0;dir<NDIR;dir++) {
int index = i+j*N+dir*N*N;
velx+=_f_new[index]*_ex[dir];
vely+=_f_new[index]*_ey[dir];
}
_ux[i+N*j] = velx/_rh[i+N*j];
_uy[i+N*j] = vely/_rh[i+N*j];
}
}
}
int main(int argc, char *argv[])
{
// the base vectors and associated weight coefficients (GPU)
double *_ex, *_ey, *_alpha; // pointers to device (GPU) memory
cudaMalloc((void **)&_ex,NDIR*sizeof(double));
cudaMalloc((void **)&_ey,NDIR*sizeof(double));
cudaMalloc((void **)&_alpha,NDIR*sizeof(double));
// ant vector (GPU)
int *_ant; // gpu memory
cudaMalloc((void **)&_ant,NDIR*sizeof(int));
// allocate memory on the GPU
double *_f, *_feq, *_f_new;
cudaMalloc((void **)&_f,N*N*NDIR*sizeof(double));
cudaMalloc((void **)&_feq,N*N*NDIR*sizeof(double));
cudaMalloc((void **)&_f_new,N*N*NDIR*sizeof(double));
double *_rh, *_ux, *_uy;
cudaMalloc((void **)&_rh,N*N*sizeof(double));
cudaMalloc((void **)&_ux,N*N*sizeof(double));
cudaMalloc((void **)&_uy,N*N*sizeof(double));
// assign a 3D distribution of CUDA "threads" within each CUDA "block"
int threadsAlongX=16, threadsAlongY=16;
dim3 dimBlock(threadsAlongX, threadsAlongY, 1);
// calculate number of blocks along X and Y in a 2D CUDA "grid"
dim3 dimGrid( ceil(float(N)/float(dimBlock.x)), ceil(float(N)/float(dimBlock.y)), 1 );
initialize<<<dimGrid,dimBlock>>>(_ex, _ey, _alpha, _ant, _rh, _ux, _uy, _f, _feq, _f_new);
// time integration
int time=0;
while(time<TIME_STEPS) {
time++;
timeIntegration<<<dimGrid,dimBlock >>>(_ex, _ey, _alpha, _ant, _rh, _ux, _uy, _f, _feq, _f_new);
}
return 0;
}