Abhijit Joshi

ENGINEER. PROGRAMMER. ARTIST.

Introduction to OpenCL

Quoting from the Khronos Group website:

OpenCL is the first open, royalty-free standard for cross-platform, parallel programming of modern processors found in personal computers, servers and handheld/embedded devices. OpenCL (Open Computing Language) greatly improves speed and responsiveness for a wide spectrum of applications in numerous market categories from gaming and entertainment to scientific and medical software.

To put it more simply, OpenCL is a cross-platform alternative to CUDA.

OpenCL is truly the language for heterogeneous computing. In these tutorials, we will explore this programming model in depth. Most of the initial code examples will be very similar to those presented in my tutorials about CUDA. As you will discover soon, the underlying concepts in both CUDA and OpenCL are very similar, but are called by different names.

Unlike CUDA, where "device" typically refers to a GPU, OpenCL devices include almost anything that can be used for computing. Here are the various options that can be specified when searching for devices (a short example of searching for a specific device type follows the list):

CL_DEVICE_TYPE_DEFAULT

CL_DEVICE_TYPE_CPU

CL_DEVICE_TYPE_GPU

CL_DEVICE_TYPE_ACCELERATOR

CL_DEVICE_TYPE_ALL
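
For example, here is a minimal sketch (assuming a platform ID has already been obtained, exactly as we do in the tutorial below) that counts only the GPU devices on a platform:


// count only the GPU devices attached to this platform
cl_uint num_gpus = 0;
cl_int err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_gpus);

if(err == CL_DEVICE_NOT_FOUND || num_gpus == 0) {
    std::cout << "No GPU devices found on this platform\n";
}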

OpenCL: Tutorial 001

Whenever you start developing OpenCL applications on a new platform, you first need to find out how many devices are available and what their specifications are. The code below answers these questions.

Create a C++ file called deviceInfo.cpp and copy the following code into this file.


#include <iostream>
#include <OpenCL/cl.h>   // note: on macOS the header is <OpenCL/cl.h>; on Linux it is <CL/cl.h>

int main(void)
{
    // define host and device data structures

    cl_platform_id platform;
    cl_device_id *devices;
    cl_uint num_devices;

    // Identify a platform

    cl_uint num_entries = 1;     // maximum number of platforms to detect (we only provide storage for one cl_platform_id)
    cl_uint num_platforms = 0;   // number of platforms actually found during run-time

    cl_int err = clGetPlatformIDs(num_entries, &platform, &num_platforms);

    if(err < 0) {
        std::cout << "Couldn't find any platforms\n";
        exit(1);
    }

    std::cout << "Detected " << num_platforms << " platforms\n";
    std::cout << "Using OpenCL to get more information about the platform:\n\n";

    char pform_name  [40];        // platform name
    char pform_vendor[40];        // platform vendor
    char pform_version[40];       // platform version
    char pform_profile[40];       // platform profile
    char pform_extensions[4096];  // platform extensions

    clGetPlatformInfo(platform, CL_PLATFORM_NAME,       sizeof(pform_name  ),     pform_name,       NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,     sizeof(pform_vendor),     pform_vendor,     NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_VERSION,    sizeof(pform_version),    pform_version,    NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_PROFILE,    sizeof(pform_profile),    pform_profile,    NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, sizeof(pform_extensions), pform_extensions, NULL);
        
    std::cout << "CL_PLATFORM_NAME       --- " << pform_name       << std::endl;
    std::cout << "CL_PLATFORM_VENDOR     --- " << pform_vendor     << std::endl;
    std::cout << "CL_PLATFORM_VERSION    --- " << pform_version    << std::endl;
    std::cout << "CL_PLATFORM_PROFILE    --- " << pform_profile    << std::endl;
    std::cout << "CL_PLATFORM_EXTENSIONS --- " << pform_extensions << std::endl;
        
    // Determine number of connected devices
        
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
        
    // Access connected devices
        
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * num_devices); 
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
        
    // Buffers to hold device information

    char name_data[48];

    cl_bool  deviceAvl;
    cl_uint  deviceAdd;
    cl_uint  frequency;
    cl_ulong globalMem;
    size_t   blockSize;
    cl_ulong sharedMem;
    cl_uint  witemDims;
    size_t   xyzDims[3];
    char     deviceExt[4096];

    // Obtain specifications for each connected device

    std::cout << std::endl;
    std::cout << "Running OpenCL code to get device specifications:\n\n";

    for(unsigned int i=0; i<num_devices; i++) {

        clGetDeviceInfo (devices[i], CL_DEVICE_NAME                     , sizeof(name_data), name_data  , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_AVAILABLE                , sizeof(deviceAvl), &deviceAvl , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_ADDRESS_BITS             , sizeof(deviceAdd), &deviceAdd , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY      , sizeof(frequency), &frequency , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_GLOBAL_MEM_SIZE          , sizeof(globalMem), &globalMem , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE      , sizeof(blockSize), &blockSize , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_LOCAL_MEM_SIZE           , sizeof(sharedMem), &sharedMem , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS , sizeof(witemDims), &witemDims , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES      , sizeof(xyzDims)  , xyzDims    , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_EXTENSIONS               , sizeof(deviceExt), deviceExt  , NULL);

        std::cout << "------------------------------------------------------------------------" << std::endl;
        std::cout << "CL_DEVICE_NAME                     " << name_data << std::endl;
        std::cout << "CL_DEVICE_AVAILABLE                " << deviceAvl << std::endl;
        std::cout << "CL_DEVICE_ADDRESS_BITS             " << deviceAdd               << " bits"       << std::endl;
        std::cout << "CL_DEVICE_MAX_CLOCK_FREQUENCY      " << (float)  frequency/1000 << " MHz"        << std::endl;
        std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE          " << (double) globalMem/1E9  << " GB "        << std::endl;
        std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE      " << blockSize               << " work items" << std::endl;
        std::cout << "CL_DEVICE_LOCAL_MEM_SIZE           " << sharedMem               << " Bytes"      << std::endl;
        std::cout << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS " << witemDims                                << std::endl;
        std::cout << "CL_DEVICE_MAX_WORK_ITEM_SIZES : X  " << xyzDims[0]              << " work items" << std::endl;
        std::cout << "                              : Y  " << xyzDims[1]              << " work items" << std::endl;
        std::cout << "                              : Z  " << xyzDims[2]              << " work items" << std::endl;
        std::cout << "CL_DEVICE_EXTENSIONS               " << deviceExt << std::endl;

        std::cout << std::endl;
    }

    free(devices);
    return 0;
}

Use the following Makefile to compile the above code:


all:
    g++ -framework OpenCL deviceInfo.cpp -o deviceInfo.x

clean:
    rm *.x
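
The -framework OpenCL flag is specific to macOS, where the header lives under <OpenCL/cl.h>. On Linux, with an OpenCL SDK installed, you would include <CL/cl.h> instead and link against the OpenCL library; the build rule typically becomes:


all:
    g++ deviceInfo.cpp -o deviceInfo.x -lOpenCL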

On my 2013 MacBook Pro, I get the following result:

bash-3.2$ ./deviceInfo.x 
Detected 1 platforms
Using OpenCL to get more information about the platform:

CL_PLATFORM_NAME       --- Apple
CL_PLATFORM_VENDOR     --- Apple
CL_PLATFORM_VERSION    --- OpenCL 1.2 (Oct  1 2013 18:21:51)
CL_PLATFORM_PROFILE    --- FULL_PROFILE
CL_PLATFORM_EXTENSIONS --- cl_APPLE_SetMemObjectDestructor 
                           cl_APPLE_ContextLoggingFunctions 
                           cl_APPLE_clut 
                           cl_APPLE_query_kernel_names 
                           cl_APPLE_gl_sharing 
                           cl_khr_gl_event

Running OpenCL code to get device specifications:

-----------------------------------------------------------------------------
CL_DEVICE_NAME                     Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
CL_DEVICE_AVAILABLE                1
CL_DEVICE_ADDRESS_BITS             64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY      2.3 GHz
CL_DEVICE_GLOBAL_MEM_SIZE          17.1799 GB 
CL_DEVICE_MAX_WORK_GROUP_SIZE      1024 work items
CL_DEVICE_LOCAL_MEM_SIZE           32768 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X  1024 work items
                              : Y  1 work items
                              : Z  1 work items
CL_DEVICE_EXTENSIONS               cl_APPLE_SetMemObjectDestructor 
                                   cl_APPLE_ContextLoggingFunctions 
                                   cl_APPLE_clut 
                                   cl_APPLE_query_kernel_names 
                                   cl_APPLE_gl_sharing 
                                   cl_khr_gl_event 
                                   cl_khr_fp64 
                                   cl_khr_global_int32_base_atomics 
                                   cl_khr_global_int32_extended_atomics 
                                   cl_khr_local_int32_base_atomics 
                                   cl_khr_local_int32_extended_atomics 
                                   cl_khr_byte_addressable_store 
                                   cl_khr_int64_base_atomics 
                                   cl_khr_int64_extended_atomics 
                                   cl_khr_3d_image_writes 
                                   cl_khr_image2d_from_buffer 
                                   cl_APPLE_fp64_basic_ops 
                                   cl_APPLE_fixed_alpha_channel_orders 
                                   cl_APPLE_biased_fixed_point_image_formats 
                                   cl_APPLE_command_queue_priority

-----------------------------------------------------------------------------
CL_DEVICE_NAME                     GeForce GT 750M
CL_DEVICE_AVAILABLE                1
CL_DEVICE_ADDRESS_BITS             32 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY      0.925 GHz
CL_DEVICE_GLOBAL_MEM_SIZE          2.14748 GB 
CL_DEVICE_MAX_WORK_GROUP_SIZE      1024 work items
CL_DEVICE_LOCAL_MEM_SIZE           49152 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X  1024 work items
                              : Y  1024 work items
                              : Z  64 work items
CL_DEVICE_EXTENSIONS               cl_APPLE_SetMemObjectDestructor
                                   cl_APPLE_ContextLoggingFunctions 
                                   cl_APPLE_clut 
                                   cl_APPLE_query_kernel_names 
                                   cl_APPLE_gl_sharing 
                                   cl_khr_gl_event 
                                   cl_khr_byte_addressable_store 
                                   cl_khr_global_int32_base_atomics 
                                   cl_khr_global_int32_extended_atomics 
                                   cl_khr_local_int32_base_atomics 
                                   cl_khr_local_int32_extended_atomics 
                                   cl_APPLE_fp64_basic_ops 
                                   cl_khr_fp64 cl_khr_3d_image_writes 
                                   cl_khr_depth_images 
                                   cl_khr_gl_depth_images 
                                   cl_khr_gl_msaa_sharing 
                                   cl_khr_image2d_from_buffer 

-----------------------------------------------------------------------------
CL_DEVICE_NAME                     Iris Pro
CL_DEVICE_AVAILABLE                1
CL_DEVICE_ADDRESS_BITS             64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY      1.2 GHz
CL_DEVICE_GLOBAL_MEM_SIZE          1.07374 GB 
CL_DEVICE_MAX_WORK_GROUP_SIZE      512 work items
CL_DEVICE_LOCAL_MEM_SIZE           65536 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X  512 work items
                              : Y  512 work items
                              : Z  512 work items
CL_DEVICE_EXTENSIONS               cl_APPLE_SetMemObjectDestructor 
                                   cl_APPLE_ContextLoggingFunctions 
                                   cl_APPLE_clut 
                                   cl_APPLE_query_kernel_names 
                                   cl_APPLE_gl_sharing 
                                   cl_khr_gl_event 
                                   cl_khr_global_int32_base_atomics 
                                   cl_khr_global_int32_extended_atomics 
                                   cl_khr_local_int32_base_atomics 
                                   cl_khr_local_int32_extended_atomics 
                                   cl_khr_byte_addressable_store 
                                   cl_khr_image2d_from_buffer 
                                   cl_khr_gl_depth_images 
                                   cl_khr_depth_images 

Terribly sorry for bombarding you with all those extensions, but I wanted to show the output exactly as it appears.

Before moving on, let's become familiar with some OpenCL terminology. The definitions below are taken from the OpenCL-2.0 Specification, released on 14 November 2013.

Platform
The host (typically the CPU) plus a collection of devices managed by the OpenCL framework that allow an application to share resources and execute kernels on devices in the platform.
Context
The environment within which the kernels execute and the domain in which synchronization and memory management is defined. The context includes a set of devices, the memory accessible to those devices, the corresponding memory properties and one or more command-queues used to schedule execution of a kernel(s) or operations on memory objects.
Global memory
A memory region accessible to all work-items executing in a context.

For the above result, we have one platform and three OpenCL devices.

We have not defined a context in the code above because we have not written or executed any kernels yet. That will be the subject of tutorial 002.

OpenCL: Tutorial 002: Vector Addition

The code in tutorial 001 will now be extended to implement vector addition. Let us begin by presenting the kernel code. Create a file called kernel_add.cl and copy the following code there:


// OpenCL kernel for vector addition

__kernel void add(   const int N,      // number of elements in the vector
                  __global int *a,     // input vector 1
                  __global int *b,     // input vector 2
                  __global int *c)     // output vector
{
    // get global location of this work-item

    int index = get_global_id(0);

    // perform addition with bound checking

    if (index < N)
        c[index] = a[index] + b[index];
}

Kernel files in OpenCL are typically given the extension .cl and are stored separately from the host code.

The get_global_id(0) function determines the global index of this particular work-item, which is then used to operate on the corresponding element of the array.
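
As a small aside, the global index can also be written in terms of the work-group built-ins. This identity is standard OpenCL (not specific to this kernel), assuming no global work offset is used (we pass NULL for it later):


// for a 1D kernel with no global offset, these two expressions agree
int index1 = get_global_id(0);
int index2 = get_group_id(0) * get_local_size(0) + get_local_id(0);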

In tutorial 001, we found that there were three devices attached to our platform. We now have complete freedom to pick and choose which device or set of devices we want to execute our kernel on. This is referred to as creating a context and is done within the host code.

Create the host code for this project (vecAdd.cpp) and copy the following code into it. Admittedly, OpenCL can get quite verbose compared to CUDA because of all the boilerplate code required to find platforms and connect devices. This is mainly because CUDA does several of these things behind the scenes, hiding the complexity from the end user.

 
#include <iostream>
#include <cmath>
#include <OpenCL/cl.h>

int main(void)
{
    // Identify a platform

    cl_platform_id platform;

    cl_uint num_entries = 1;     // maximum number of platforms we are interested in detecting
    cl_uint num_platforms = 0;   // number of platforms actually found during run-time

    cl_int err = clGetPlatformIDs(num_entries, &platform, &num_platforms);

    // error check
    if(err < 0) {
        std::cout << "Couldn't find any platforms\n";
        exit(1);
    }

    std::cout << "Detected " << num_platforms << " platforms\n";
    std::cout << "Using OpenCL to get more information about the platform:\n\n";

    char pform_name  [40];        // platform name
    char pform_vendor[40];        // platform vendor
    char pform_version[40];       // platform version
    char pform_profile[40];       // platform profile
    char pform_extensions[4096];  // platform extensions

    clGetPlatformInfo(platform, CL_PLATFORM_NAME,       sizeof(pform_name  ),     pform_name,       NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,     sizeof(pform_vendor),     pform_vendor,     NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_VERSION,    sizeof(pform_version),    pform_version,    NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_PROFILE,    sizeof(pform_profile),    pform_profile,    NULL);
    clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, sizeof(pform_extensions), pform_extensions, NULL);

    std::cout << "CL_PLATFORM_NAME       --- " << pform_name       << std::endl;
    std::cout << "CL_PLATFORM_VENDOR     --- " << pform_vendor     << std::endl;
    std::cout << "CL_PLATFORM_VERSION    --- " << pform_version    << std::endl;
    std::cout << "CL_PLATFORM_VERSION    --- " << pform_profile    << std::endl;
    std::cout << "CL_PLATFORM_EXTENSIONS --- " << pform_extensions << std::endl;

    // Determine number of devices connected to this platform

    cl_uint       numOfDevices;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, NULL, &numOfDevices);

    // error check

    if(err < 0) {
        std::cout << "Couldn't find any devices\n";
        exit(1);
    }

    std::cout << "\nNumber of connected devices found = " << numOfDevices << std::endl;

    // allocate memory to store devices

    cl_device_id* devices = (cl_device_id*) malloc(sizeof(cl_device_id) * numOfDevices);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);

    // Buffers to hold device information

    char name_data[48];

    cl_bool  deviceAvl;
    cl_uint  deviceAdd;
    cl_uint  frequency;
    cl_ulong globalMem;
    size_t   blockSize;
    cl_ulong sharedMem;
    cl_uint  witemDims;
    size_t   xyzDims[3];

    // Obtain specifications for each connected GPU

    std::cout << std::endl;
    std::cout << "Running OpenCL code to get device specifications:\n\n";

    for(unsigned int i=0; i<numOfDevices; i++) {

        clGetDeviceInfo (devices[i], CL_DEVICE_NAME                     , sizeof(name_data), name_data  , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_AVAILABLE                , sizeof(deviceAvl), &deviceAvl , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_ADDRESS_BITS             , sizeof(deviceAdd), &deviceAdd , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY      , sizeof(frequency), &frequency , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_GLOBAL_MEM_SIZE          , sizeof(globalMem), &globalMem , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE      , sizeof(blockSize), &blockSize , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_LOCAL_MEM_SIZE           , sizeof(sharedMem), &sharedMem , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS , sizeof(witemDims), &witemDims , NULL);
        clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES      , sizeof(xyzDims)  , xyzDims    , NULL);

        std::cout << "-------------------------" << std::endl;
        std::cout << "CL_DEVICE_NAME                     " << name_data << std::endl;
        std::cout << "CL_DEVICE_AVAILABLE                " << deviceAvl << std::endl;
        std::cout << "CL_DEVICE_ADDRESS_BITS             " << deviceAdd               << " bits"       << std::endl;
        std::cout << "CL_DEVICE_MAX_CLOCK_FREQUENCY      " << (float)  frequency/1000 << " MHz"        << std::endl;
        std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE          " << (double) globalMem/1E9  << " GB "        << std::endl;
        std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE      " << blockSize               << " work items" << std::endl;
        std::cout << "CL_DEVICE_LOCAL_MEM_SIZE           " << sharedMem               << " Bytes"      << std::endl;
        std::cout << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS " << witemDims                                << std::endl;
        std::cout << "CL_DEVICE_MAX_WORK_ITEM_SIZES : X  " << xyzDims[0]              << " work items" << std::endl;
        std::cout << "                              : Y  " << xyzDims[1]              << " work items" << std::endl;
        std::cout << "                              : Z  " << xyzDims[2]              << " work items" << std::endl;

        std::cout << std::endl;
    }

So far, the code is pretty much identical to tutorial 001. We have identified a platform and have identified and printed out information about all devices attached to it.

Next, we select devices and create a context in which our kernel will execute.


    // create a context that uses some or all available devices

    std::cout << "\nCreating a context (grouping devices on which kernels will be run):\n\n";

    const cl_device_id     listOfDevicesInContext[] = {devices[0]};   // specify list of devices used (based on their ID)

    cl_context context = clCreateContext(NULL,                        // cl_context_properties*
                                         1,                           // total number of devices we wish to use
                                         listOfDevicesInContext,      // specify the device IDs
                                         NULL,                        // no call-back function registered
                                         NULL,                        // no user data for the call-back function
                                         &err);                       // returns error code

In the above code, we have selected devices[0] (the Intel CPU) to be the only device in the context. We can easily pick a different device, or more than one device, by changing the above parameters appropriately.
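
As an aside, when we only care about the type of device and not about a specific device ID, OpenCL also offers clCreateContextFromType. Here is a minimal sketch, assuming the platform found earlier, that would group every GPU on the platform into a context:


    // context properties: a zero-terminated list naming the platform to use
    cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0 };

    // let OpenCL collect all GPU devices on this platform into one context
    cl_context gpuContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);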

We can query the context to find out what devices are present. This is illustrated next:


    // How to query the context created above and get information about it

    // how to get the number of devices used in a context

    cl_uint getNumDevicesInContext;
    clGetContextInfo (context, CL_CONTEXT_NUM_DEVICES,     sizeof(cl_uint), &getNumDevicesInContext, NULL);

    // allocate an appropriately sized device list and get the IDs of the
    // devices used in the context

    cl_device_id* getContextDevices = (cl_device_id*) malloc(sizeof(cl_device_id) * getNumDevicesInContext);
    clGetContextInfo (context, CL_CONTEXT_DEVICES, sizeof(cl_device_id) * getNumDevicesInContext, getContextDevices, NULL);

    // get the context's reference count (how many objects currently hold a reference to it)

    cl_uint       ref_count;
    clGetContextInfo (context, CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_count), &ref_count,              NULL);

    // printing information about the context

    std::cout << "CL_CONTEXT_NUM_DEVICES             " << getNumDevicesInContext << std::endl;
    std::cout << "CL_CONTEXT_DEVICES                 ";

    for(unsigned int i=0; i<getNumDevicesInContext; i++) {
        clGetDeviceInfo (getContextDevices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
        std::cout << name_data << "   ";
    }
    std::cout << std::endl;
    std::cout << "CL_CONTEXT_REFERENCE_COUNT         " << ref_count           << std::endl;

Now that we have a context, we are ready to run our kernel on it. But first we need to read in the kernel (remember, we stored it in an external file called kernel_add.cl) and build it into a program that the devices can execute. This is accomplished next:


    // Read kernel file into a buffer

    FILE *file_handle = fopen("kernel_add.cl", "r");  // open the kernel source file
    if(file_handle == NULL) {
        std::cout << "Couldn't find the kernel file kernel_add.cl\n";
        exit(1);
    }
    fseek(file_handle, 0, SEEK_END);                  // move position to end of file
    size_t program_size = ftell(file_handle);         // get size of the kernel source in bytes
    rewind(file_handle);

    char *program_buffer = (char*) malloc(program_size + 1);
    program_buffer[program_size] = '\0';

    fread(program_buffer, sizeof(char), program_size, file_handle);
    fclose(file_handle);

    // build the program

    cl_program program = clCreateProgramWithSource(context, 1, (const char**) &program_buffer, &program_size, &err);
    free(program_buffer);

    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    // create kernel

    cl_kernel kernel = clCreateKernel(program, "add", &err);
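
Note that the clBuildProgram call above is not error-checked. If the kernel source fails to compile, the compiler messages can be recovered with clGetProgramBuildInfo. Here is a sketch of what that error handling could look like, using the first device in our context:


    if(clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {

        // query the size of the build log, then fetch and print it
        size_t log_size;
        clGetProgramBuildInfo(program, getContextDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

        char *log = (char*) malloc(log_size + 1);
        clGetProgramBuildInfo(program, getContextDevices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        log[log_size] = '\0';

        std::cout << "Kernel build failed:\n" << log << std::endl;
        free(log);
        exit(1);
    }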

Next, we need to create what's called a command queue:


    // create a command queue

    cl_command_queue command_queue = clCreateCommandQueue(context,                   // specify context
                                                          getContextDevices[0],      // specify device
                                                          0,                         // default properties (in-order execution, no profiling)
                                                          &err);                     // returns error code
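
(A side note: the OpenCL 2.0 specification deprecates clCreateCommandQueue in favor of clCreateCommandQueueWithProperties. Since the Apple platform above reports OpenCL 1.2, the call used here is the appropriate one for this machine.)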

We now come to the part where we allocate memory on the host for storing the input and output data and then initialize this data.


    // allocate memory buffers on the host

    const int N = 100;

    int *h_a = new int[N];   // vector "a"
    int *h_b = new int[N];   // vector "b"
    int *h_c = new int[N];   // c = a + b

    // fill input buffers on the host

    for(int i = 0; i < N; i++) {
        h_a[i] = i;
        h_b[i] = 2*i;
    }

The next step is to allocate space on the device for storing all the input and output data. Host data is then copied to the device global memory. When kernels execute, they access and work with data in device memory.


    // allocate memory buffers on the device

    cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY,  N * sizeof(int), NULL, &err);
    cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY,  N * sizeof(int), NULL, &err);
    cl_mem d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(int), NULL, &err);

    // copy input buffers from host memory to device memory

    clEnqueueWriteBuffer(command_queue, d_a, CL_TRUE, 0, N * sizeof(int), h_a, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, d_b, CL_TRUE, 0, N * sizeof(int), h_b, 0, NULL, NULL);
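
Note that passing CL_TRUE makes these writes blocking: clEnqueueWriteBuffer does not return until the contents of h_a and h_b have been copied out, so the host buffers could be safely modified or reused immediately afterwards.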

Unlike CUDA, we need to explicitly set kernel function parameters:


    // set kernel function parameters

    clSetKernelArg(kernel, 0, sizeof(cl_int), &N  );    // parameter 0
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);    // parameter 1
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);    // parameter 2
    clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);    // parameter 3

We now specify launch parameters:

Note that in this case, we use 100 work-groups with 1 work-item per work-group. More typically, we would use several hundred work-items inside a single work-group, in which case the total number of work-items in the kernel may exceed the actual size of the data.

The total number of work-items is always an integer multiple of the number of work-items per work-group.

This is why bound-checking inside kernels is so important: any extra work-items need to return without touching the data in global memory. For example, with N = 100 and a work-group size of 32, the global size is rounded up to ceil(100/32)*32 = 128, so the last 28 work-items must do nothing.

The kernel is launched using clEnqueueNDRangeKernel.


    // kernel launch parameters (work distribution)

    // Number of work items in each local work group

    size_t localSize = 1;

    // Total number of work items (rounded up to a multiple of localSize)

    size_t globalSize = ceil(N/(float)localSize)*localSize;

    // print kernel launch parameters
    std::cout << std::endl;
    std::cout << "Kernel launch parameters:  localSize  = " << localSize << std::endl;
    std::cout << "                           globalSize = " << globalSize << std::endl;
    std::cout << std::endl;

    // execute OpenCL kernel

    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
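
Optionally, the kernel execution time can be measured using an OpenCL event. The sketch below assumes the command queue was created with the CL_QUEUE_PROFILING_ENABLE property (ours above was not; the third argument to clCreateCommandQueue would need to change accordingly):


    // enqueue the kernel and capture an event associated with it
    cl_event kernel_event;
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &kernel_event);
    clWaitForEvents(1, &kernel_event);

    // read the start and end timestamps (reported in nanoseconds)
    cl_ulong t_start, t_end;
    clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
    clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END,   sizeof(t_end),   &t_end,   NULL);

    std::cout << "Kernel time = " << (t_end - t_start) * 1e-6 << " ms" << std::endl;
    clReleaseEvent(kernel_event);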

Finally, we copy results back to the host and perform clean-up.


    // copy results from device memory to host memory

    clEnqueueReadBuffer(command_queue, d_c, CL_TRUE, 0, N * sizeof(int), h_c, 0, NULL, NULL);

    // display result using the buffer on the host

    std::cout << std::endl;

    for(int i = 0; i < N; i++) {
        std::cout << "a = " << h_a[i] << " b = " << h_b[i]  << " c = a + b = " << h_c[i] << std::endl;
    }

    // Clean up

    err = clFlush(command_queue);
    err = clFinish(command_queue);

    err = clReleaseKernel(kernel);
    err = clReleaseProgram(program);

    err = clReleaseMemObject(d_a);
    err = clReleaseMemObject(d_b);
    err = clReleaseMemObject(d_c);

    err = clReleaseCommandQueue(command_queue);
    err = clReleaseContext(context);

    free(getContextDevices);
    free(devices);

    delete [] h_a;
    delete [] h_b;
    delete [] h_c;

    // main program ends

    return 0;
}

Let's summarize the flow of the above OpenCL code:

  1. Detect an OpenCL platform.
  2. Identify devices connected to that platform.
  3. Create a context - select and group devices on which our kernel will execute.
  4. Read in kernel code (typically from an external file) and build the executable.
  5. Create a command queue.
  6. Initialize data on the host (typically the CPU).
  7. Copy data to device (global) memory.
  8. Organize work-items for parallel execution inside the kernel.
  9. Launch kernel.
  10. Copy results back to the host.
  11. Clean up.

The code can be compiled using the following Makefile:


all:
    g++ vecAdd.cpp -o vecAdd.x -framework OpenCL

clean:
    rm *.x 

Here is the output after running the code:


Detected 1 platforms
Using OpenCL to get more information about the platform:

CL_PLATFORM_NAME       --- Apple
CL_PLATFORM_VENDOR     --- Apple
CL_PLATFORM_VERSION    --- OpenCL 1.2 (Oct  1 2013 18:21:51)
CL_PLATFORM_PROFILE    --- FULL_PROFILE
CL_PLATFORM_EXTENSIONS --- cl_APPLE_SetMemObjectDestructor 
                           cl_APPLE_ContextLoggingFunctions 
                           cl_APPLE_clut 
                           cl_APPLE_query_kernel_names 
                           cl_APPLE_gl_sharing cl_khr_gl_event

Number of connected devices found = 3

Running OpenCL code to get device specifications:

-------------------------
CL_DEVICE_NAME                     Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
CL_DEVICE_AVAILABLE                1
CL_DEVICE_ADDRESS_BITS             64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY      2.3 GHz
CL_DEVICE_GLOBAL_MEM_SIZE          17.1799 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE      1024 work items
CL_DEVICE_LOCAL_MEM_SIZE           32768 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X  1024 work items
                              : Y  1 work items
                              : Z  1 work items

-------------------------
CL_DEVICE_NAME                     GeForce GT 750M
CL_DEVICE_AVAILABLE                1
CL_DEVICE_ADDRESS_BITS             32 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY      0.925 GHz
CL_DEVICE_GLOBAL_MEM_SIZE          2.14748 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE      1024 work items
CL_DEVICE_LOCAL_MEM_SIZE           49152 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X  1024 work items
                              : Y  1024 work items
                              : Z  64 work items

-------------------------
CL_DEVICE_NAME                     Iris Pro
CL_DEVICE_AVAILABLE                1
CL_DEVICE_ADDRESS_BITS             64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY      1.2 GHz
CL_DEVICE_GLOBAL_MEM_SIZE          1.07374 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE      512 work items
CL_DEVICE_LOCAL_MEM_SIZE           65536 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X  512 work items
                              : Y  512 work items
                              : Z  512 work items

Creating a context (grouping devices on which kernels will be run):

CL_CONTEXT_NUM_DEVICES             1
CL_CONTEXT_DEVICES                 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
CL_CONTEXT_REFERENCE_COUNT         1


Kernel launch parameters:  localSize  = 1
                           globalSize = 100


a = 0 b = 0 c = a + b = 0
a = 1 b = 2 c = a + b = 3
a = 2 b = 4 c = a + b = 6
a = 3 b = 6 c = a + b = 9
a = 4 b = 8 c = a + b = 12
a = 5 b = 10 c = a + b = 15
a = 6 b = 12 c = a + b = 18
a = 7 b = 14 c = a + b = 21
a = 8 b = 16 c = a + b = 24
a = 9 b = 18 c = a + b = 27
a = 10 b = 20 c = a + b = 30
a = 11 b = 22 c = a + b = 33
a = 12 b = 24 c = a + b = 36
a = 13 b = 26 c = a + b = 39
a = 14 b = 28 c = a + b = 42
a = 15 b = 30 c = a + b = 45
a = 16 b = 32 c = a + b = 48
a = 17 b = 34 c = a + b = 51
a = 18 b = 36 c = a + b = 54
a = 19 b = 38 c = a + b = 57
a = 20 b = 40 c = a + b = 60
a = 21 b = 42 c = a + b = 63
a = 22 b = 44 c = a + b = 66
a = 23 b = 46 c = a + b = 69
a = 24 b = 48 c = a + b = 72
a = 25 b = 50 c = a + b = 75
a = 26 b = 52 c = a + b = 78
a = 27 b = 54 c = a + b = 81
a = 28 b = 56 c = a + b = 84
a = 29 b = 58 c = a + b = 87
a = 30 b = 60 c = a + b = 90
a = 31 b = 62 c = a + b = 93
a = 32 b = 64 c = a + b = 96
a = 33 b = 66 c = a + b = 99
a = 34 b = 68 c = a + b = 102
a = 35 b = 70 c = a + b = 105
a = 36 b = 72 c = a + b = 108
a = 37 b = 74 c = a + b = 111
a = 38 b = 76 c = a + b = 114
a = 39 b = 78 c = a + b = 117
a = 40 b = 80 c = a + b = 120
a = 41 b = 82 c = a + b = 123
a = 42 b = 84 c = a + b = 126
a = 43 b = 86 c = a + b = 129
a = 44 b = 88 c = a + b = 132
a = 45 b = 90 c = a + b = 135
a = 46 b = 92 c = a + b = 138
a = 47 b = 94 c = a + b = 141
a = 48 b = 96 c = a + b = 144
a = 49 b = 98 c = a + b = 147
a = 50 b = 100 c = a + b = 150
a = 51 b = 102 c = a + b = 153
a = 52 b = 104 c = a + b = 156
a = 53 b = 106 c = a + b = 159
a = 54 b = 108 c = a + b = 162
a = 55 b = 110 c = a + b = 165
a = 56 b = 112 c = a + b = 168
a = 57 b = 114 c = a + b = 171
a = 58 b = 116 c = a + b = 174
a = 59 b = 118 c = a + b = 177
a = 60 b = 120 c = a + b = 180
a = 61 b = 122 c = a + b = 183
a = 62 b = 124 c = a + b = 186
a = 63 b = 126 c = a + b = 189
a = 64 b = 128 c = a + b = 192
a = 65 b = 130 c = a + b = 195
a = 66 b = 132 c = a + b = 198
a = 67 b = 134 c = a + b = 201
a = 68 b = 136 c = a + b = 204
a = 69 b = 138 c = a + b = 207
a = 70 b = 140 c = a + b = 210
a = 71 b = 142 c = a + b = 213
a = 72 b = 144 c = a + b = 216
a = 73 b = 146 c = a + b = 219
a = 74 b = 148 c = a + b = 222
a = 75 b = 150 c = a + b = 225
a = 76 b = 152 c = a + b = 228
a = 77 b = 154 c = a + b = 231
a = 78 b = 156 c = a + b = 234
a = 79 b = 158 c = a + b = 237
a = 80 b = 160 c = a + b = 240
a = 81 b = 162 c = a + b = 243
a = 82 b = 164 c = a + b = 246
a = 83 b = 166 c = a + b = 249
a = 84 b = 168 c = a + b = 252
a = 85 b = 170 c = a + b = 255
a = 86 b = 172 c = a + b = 258
a = 87 b = 174 c = a + b = 261
a = 88 b = 176 c = a + b = 264
a = 89 b = 178 c = a + b = 267
a = 90 b = 180 c = a + b = 270
a = 91 b = 182 c = a + b = 273
a = 92 b = 184 c = a + b = 276
a = 93 b = 186 c = a + b = 279
a = 94 b = 188 c = a + b = 282
a = 95 b = 190 c = a + b = 285
a = 96 b = 192 c = a + b = 288
a = 97 b = 194 c = a + b = 291
a = 98 b = 196 c = a + b = 294
a = 99 b = 198 c = a + b = 297

Now let's have some fun! We have a working version of vector addition (N = 100), with the kernel currently executing on the CPU. How large can N be? What about executing the kernel on the NVIDIA GPU or on the Intel Iris Pro?
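
A rough upper bound to get you started: the three int buffers occupy 3 x 4 x N bytes of device memory, so on the GT 750M with about 2.15 GB of global memory, N could not exceed roughly 2.15e9 / 12, or about 1.8e8 elements, and in practice the limit is lower still (CL_DEVICE_MAX_MEM_ALLOC_SIZE caps any single buffer well below the total). Try it and see where each device actually gives up.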