Quoting from the Khronos Group website:
OpenCL is the first open, royalty-free standard for cross-platform, parallel programming of modern processors found in personal computers, servers and handheld/embedded devices. OpenCL (Open Computing Language) greatly improves speed and responsiveness for a wide spectrum of applications in numerous market categories from gaming and entertainment to scientific and medical software.
To put it more simply, OpenCL is a cross-platform alternative to CUDA.
OpenCL is truly the language for heterogeneous computing. In these tutorials, we will explore this programming model in depth. Most of the initial code examples will be very similar to those presented in my tutorials about CUDA. As you will discover soon, the underlying concepts in both CUDA and OpenCL are very similar, but are called by different names.
Unlike CUDA, where "device" typically refers to a GPU, OpenCL devices include almost anything that can be used for computing. Here are the various options that can be specified when searching for devices:
CL_DEVICE_TYPE_DEFAULT
CL_DEVICE_TYPE_CPU
CL_DEVICE_TYPE_GPU
CL_DEVICE_TYPE_ACCELERATOR
CL_DEVICE_TYPE_ALL
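For example, to count only the GPUs attached to a platform, we can pass CL_DEVICE_TYPE_GPU instead of CL_DEVICE_TYPE_ALL. Here is a minimal sketch (assuming a platform handle obtained as in the code below); clGetDeviceIDs returns CL_DEVICE_NOT_FOUND when no matching device exists:
// count only the GPU devices attached to this platform
cl_uint num_gpus = 0;
cl_int err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_gpus);
if(err == CL_DEVICE_NOT_FOUND)
    std::cout << "No GPU devices found on this platform\n";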
Whenever you start developing OpenCL applications on a new platform, you first need to find out how many devices are available and what their capabilities are. The code below answers both questions.
Create a C++ file called deviceInfo.cpp and copy the following code into this file.
#include <iostream>
#include <OpenCL/cl.h>
int main(void)
{
// define host and device data structures
cl_platform_id platform;
cl_device_id *devices;
cl_uint num_devices;
// Identify a platform
cl_uint num_entries = 1; // maximum number of platforms to detect (we pass a single cl_platform_id, so this must be 1)
cl_uint num_platforms = 0; // number of platforms actually found during run-time
cl_int err = clGetPlatformIDs(num_entries, &platform, &num_platforms);
if(err < 0) {
std::cout << "Couldn't find any platforms\n";
exit(1);
}
std::cout << "Detected " << num_platforms << " platforms\n";
std::cout << "Using OpenCL to get more information about the platform:\n\n";
char pform_name [40]; // platform name
char pform_vendor[40]; // platform vendor
char pform_version[40]; // platform version
char pform_profile[40]; // platform profile
char pform_extensions[4096]; // platform extensions
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pform_name ), &pform_name, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pform_vendor), &pform_vendor, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pform_version), &pform_version, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, sizeof(pform_profile), &pform_profile, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, sizeof(pform_extensions), &pform_extensions, NULL);
std::cout << "CL_PLATFORM_NAME --- " << pform_name << std::endl;
std::cout << "CL_PLATFORM_VENDOR --- " << pform_vendor << std::endl;
std::cout << "CL_PLATFORM_VERSION --- " << pform_version << std::endl;
std::cout << "CL_PLATFORM_PROFILE --- " << pform_profile << std::endl;
std::cout << "CL_PLATFORM_EXTENSIONS --- " << pform_extensions << std::endl;
// Determine number of connected devices
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
// Access connected devices
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * num_devices);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
// Buffers and variables for the device queries
char name_data[48]; // device name
cl_bool deviceAvl;
cl_uint deviceAdd;
cl_uint frequency;
cl_ulong globalMem;
size_t blockSize; // CL_DEVICE_MAX_WORK_GROUP_SIZE returns a size_t
cl_ulong sharedMem;
cl_uint witemDims;
size_t xyzDims[3]; // CL_DEVICE_MAX_WORK_ITEM_SIZES returns an array of size_t
char deviceExt[4096];
// Obtain specifications for each connected device
std::cout << std::endl;
std::cout << "Running OpenCL code to get device specifications:\n\n";
for(unsigned int i=0; i<num_devices; i++) {
clGetDeviceInfo (devices[i], CL_DEVICE_NAME                     , sizeof(name_data), name_data  , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_AVAILABLE                , sizeof(deviceAvl), &deviceAvl , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_ADDRESS_BITS             , sizeof(deviceAdd), &deviceAdd , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY      , sizeof(frequency), &frequency , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_GLOBAL_MEM_SIZE          , sizeof(globalMem), &globalMem , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE      , sizeof(blockSize), &blockSize , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_LOCAL_MEM_SIZE           , sizeof(sharedMem), &sharedMem , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS , sizeof(witemDims), &witemDims , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES      , sizeof(xyzDims)  , xyzDims    , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_EXTENSIONS               , sizeof(deviceExt), deviceExt  , NULL);
std::cout << "------------------------------------------------------------------------" << std::endl;
std::cout << "CL_DEVICE_NAME " << name_data << std::endl;
std::cout << "CL_DEVICE_AVAILABLE " << deviceAvl << std::endl;
std::cout << "CL_DEVICE_ADDRESS_BITS " << deviceAdd << " bits" << std::endl;
std::cout << "CL_DEVICE_MAX_CLOCK_FREQUENCY " << (float) frequency/1000 << " MHz" << std::endl;
std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE " << (double) globalMem/1E9 << " GB " << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE " << blockSize << " work items" << std::endl;
std::cout << "CL_DEVICE_LOCAL_MEM_SIZE " << sharedMem << " Bytes" << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS " << witemDims << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_ITEM_SIZES : X " << xyzDims[0] << " work items" << std::endl;
std::cout << " : Y " << xyzDims[1] << " work items" << std::endl;
std::cout << " : Z " << xyzDims[2] << " work items" << std::endl;
std::cout << "CL_DEVICE_EXTENSIONS " << deviceExt << std::endl;
std::cout << std::endl;
}
free(devices);
return 0;
}
Use the following Makefile to compile the above code:
all:
g++ -framework OpenCL deviceInfo.cpp -o deviceInfo.x
clean:
rm *.x
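Note that the -framework OpenCL flag and the <OpenCL/cl.h> include path are macOS-specific. On Linux with an OpenCL SDK installed, the header is typically <CL/cl.h> and you link with -lOpenCL instead, e.g.:
all:
	g++ deviceInfo.cpp -o deviceInfo.x -lOpenCL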
On my 2013 MacBook Pro, I get the following result:
bash-3.2$ ./deviceInfo.x
Detected 1 platforms
Using OpenCL to get more information about the platform:

CL_PLATFORM_NAME --- Apple
CL_PLATFORM_VENDOR --- Apple
CL_PLATFORM_VERSION --- OpenCL 1.2 (Oct 1 2013 18:21:51)
CL_PLATFORM_PROFILE --- FULL_PROFILE
CL_PLATFORM_EXTENSIONS --- cl_APPLE_SetMemObjectDestructor cl_APPLE_ContextLoggingFunctions cl_APPLE_clut cl_APPLE_query_kernel_names cl_APPLE_gl_sharing cl_khr_gl_event

Running OpenCL code to get device specifications:

-----------------------------------------------------------------------------
CL_DEVICE_NAME Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
CL_DEVICE_AVAILABLE 1
CL_DEVICE_ADDRESS_BITS 64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY 2.3 GHz
CL_DEVICE_GLOBAL_MEM_SIZE 17.1799 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE 1024 work items
CL_DEVICE_LOCAL_MEM_SIZE 32768 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X 1024 work items
                              : Y 1 work items
                              : Z 1 work items
CL_DEVICE_EXTENSIONS cl_APPLE_SetMemObjectDestructor cl_APPLE_ContextLoggingFunctions cl_APPLE_clut cl_APPLE_query_kernel_names cl_APPLE_gl_sharing cl_khr_gl_event cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_3d_image_writes cl_khr_image2d_from_buffer cl_APPLE_fp64_basic_ops cl_APPLE_fixed_alpha_channel_orders cl_APPLE_biased_fixed_point_image_formats cl_APPLE_command_queue_priority
-----------------------------------------------------------------------------
CL_DEVICE_NAME GeForce GT 750M
CL_DEVICE_AVAILABLE 1
CL_DEVICE_ADDRESS_BITS 32 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY 0.925 GHz
CL_DEVICE_GLOBAL_MEM_SIZE 2.14748 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE 1024 work items
CL_DEVICE_LOCAL_MEM_SIZE 49152 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X 1024 work items
                              : Y 1024 work items
                              : Z 64 work items
CL_DEVICE_EXTENSIONS cl_APPLE_SetMemObjectDestructor cl_APPLE_ContextLoggingFunctions cl_APPLE_clut cl_APPLE_query_kernel_names cl_APPLE_gl_sharing cl_khr_gl_event cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_APPLE_fp64_basic_ops cl_khr_fp64 cl_khr_3d_image_writes cl_khr_depth_images cl_khr_gl_depth_images cl_khr_gl_msaa_sharing cl_khr_image2d_from_buffer
-----------------------------------------------------------------------------
CL_DEVICE_NAME Iris Pro
CL_DEVICE_AVAILABLE 1
CL_DEVICE_ADDRESS_BITS 64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY 1.2 GHz
CL_DEVICE_GLOBAL_MEM_SIZE 1.07374 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE 512 work items
CL_DEVICE_LOCAL_MEM_SIZE 65536 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X 512 work items
                              : Y 512 work items
                              : Z 512 work items
CL_DEVICE_EXTENSIONS cl_APPLE_SetMemObjectDestructor cl_APPLE_ContextLoggingFunctions cl_APPLE_clut cl_APPLE_query_kernel_names cl_APPLE_gl_sharing cl_khr_gl_event cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_image2d_from_buffer cl_khr_gl_depth_images cl_khr_depth_images
Apologies for bombarding you with all those extensions, but I wanted to show the output exactly as it appears.
Before moving on, let's become familiar with some OpenCL terminology. The definitions below are adapted from the OpenCL 2.0 Specification, released on 14 November 2013:
Platform: the host plus a collection of devices managed by the OpenCL framework.
Device: a collection of compute units on which kernels execute; a CPU, GPU or other accelerator.
Context: the environment within which kernels execute; it includes a set of devices and the memory accessible to them.
Program: a collection of kernels, along with auxiliary functions and constant data.
Kernel: a function declared in a program and executed on an OpenCL device.
Work-item: one of a collection of parallel executions of a kernel; the OpenCL counterpart of a CUDA thread.
Work-group: a collection of related work-items that execute on a single compute unit; the OpenCL counterpart of a CUDA thread block.
Command queue: an object holding commands (such as kernel launches and memory transfers) that will be executed on a specific device.
For the above result, we have one platform (Apple) and three OpenCL devices:
1. the Intel Core i7-4850HQ CPU,
2. the NVIDIA GeForce GT 750M discrete GPU, and
3. the Intel Iris Pro integrated GPU.
We have not actually created a context in the code above, because we have not written or executed any kernels yet. That will be the subject of tutorial 002.
The code in tutorial 001 will now be extended to implement vector addition. Let us begin by presenting the kernel code. Create a file called kernel_add.cl and copy the following code there:
// OpenCL kernel for vector addition
__kernel void add( const int N, // number of elements in the vector
__global int *a, // input vector 1
__global int *b, // input vector 2
__global int *c) // output vector
{
// get global location of this work-item
int index = get_global_id(0);
// perform addition with bound checking
if (index < N)
c[index] = a[index] + b[index];
}
Kernel files in OpenCL are typically given the extension .cl and are stored separately from the host code.
The get_global_id(0) function returns the global index (along dimension 0) of the work-item executing this particular kernel instance; each work-item uses it to operate on its own element of the array.
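For readers coming from CUDA, the decomposition of the global index may help. The following identity holds inside any kernel for dimension 0 (assuming the default zero global work offset), and mirrors the familiar CUDA expression blockIdx.x * blockDim.x + threadIdx.x:
// identity relating the OpenCL work-item built-ins (dimension 0):
//
//   get_global_id(0) == get_group_id(0) * get_local_size(0) + get_local_id(0)
//
// CUDA analogue:  blockIdx.x * blockDim.x + threadIdx.x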
In tutorial 001, we found that there were three devices attached to our platform. We now have complete freedom to pick and choose which device or set of devices we want to execute our kernel on. This is referred to as creating a context and is done within the host code.
Create the host code for this project (vecAdd.cpp) and copy the following code there. Admittedly, OpenCL can get quite verbose compared to CUDA because of all the boilerplate code required to find platforms and connect devices. This is mainly because CUDA does several of these things behind the scenes, hiding this complexity from the end user.
#include <iostream>
#include <cmath>
#include <OpenCL/cl.h>
int main(void)
{
// Identify a platform
cl_platform_id platform;
cl_uint num_entries = 1; // maximum number of platforms we are interested in detecting
cl_uint num_platforms = 0; // number of platforms actually found during run-time
cl_int err = clGetPlatformIDs(num_entries, &platform, &num_platforms);
// error check
if(err < 0) {
std::cout << "Couldn't find any platforms\n";
exit(1);
}
std::cout << "Detected " << num_platforms << " platforms\n";
std::cout << "Using OpenCL to get more information about the platform:\n\n";
char pform_name [40]; // platform name
char pform_vendor[40]; // platform vendor
char pform_version[40]; // platform version
char pform_profile[40]; // platform profile
char pform_extensions[4096]; // platform extensions
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pform_name ), &pform_name, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pform_vendor), &pform_vendor, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pform_version), &pform_version, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, sizeof(pform_profile), &pform_profile, NULL);
clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, sizeof(pform_extensions), &pform_extensions, NULL);
std::cout << "CL_PLATFORM_NAME --- " << pform_name << std::endl;
std::cout << "CL_PLATFORM_VENDOR --- " << pform_vendor << std::endl;
std::cout << "CL_PLATFORM_VERSION --- " << pform_version << std::endl;
std::cout << "CL_PLATFORM_VERSION --- " << pform_profile << std::endl;
std::cout << "CL_PLATFORM_EXTENSIONS --- " << pform_extensions << std::endl;
// Determine number of devices connected to this platform
cl_uint numOfDevices;
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices);
// error check
if(err < 0) {
std::cout << "Couldn't find any devices\n";
exit(1);
}
std::cout << "\nNumber of connected devices found = " << numOfDevices << std::endl;
// allocate memory to store devices
cl_device_id* devices = (cl_device_id*) malloc(sizeof(cl_device_id) * numOfDevices);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
// Buffers and variables for the device queries
char name_data[48]; // device name
cl_bool deviceAvl;
cl_uint deviceAdd;
cl_uint frequency;
cl_ulong globalMem;
size_t blockSize; // CL_DEVICE_MAX_WORK_GROUP_SIZE returns a size_t
cl_ulong sharedMem;
cl_uint witemDims;
size_t xyzDims[3]; // CL_DEVICE_MAX_WORK_ITEM_SIZES returns an array of size_t
// Obtain specifications for each connected GPU
std::cout << std::endl;
std::cout << "Running OpenCL code to get device specifications:\n\n";
for(unsigned int i=0; i<numOfDevices; i++) {
clGetDeviceInfo (devices[i], CL_DEVICE_NAME , sizeof(name_data), name_data , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_AVAILABLE                , sizeof(deviceAvl), &deviceAvl , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_ADDRESS_BITS             , sizeof(deviceAdd), &deviceAdd , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY      , sizeof(frequency), &frequency , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_GLOBAL_MEM_SIZE          , sizeof(globalMem), &globalMem , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE      , sizeof(blockSize), &blockSize , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_LOCAL_MEM_SIZE           , sizeof(sharedMem), &sharedMem , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS , sizeof(witemDims), &witemDims , NULL);
clGetDeviceInfo (devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES      , sizeof(xyzDims)  , xyzDims    , NULL);
std::cout << "-------------------------" << std::endl;
std::cout << "CL_DEVICE_NAME " << name_data << std::endl;
std::cout << "CL_DEVICE_AVAILABLE " << deviceAvl << std::endl;
std::cout << "CL_DEVICE_ADDRESS_BITS " << deviceAdd << " bits" << std::endl;
std::cout << "CL_DEVICE_MAX_CLOCK_FREQUENCY " << (float) frequency/1000 << " MHz" << std::endl;
std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE " << (double) globalMem/1E9 << " GB " << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE " << blockSize << " work items" << std::endl;
std::cout << "CL_DEVICE_LOCAL_MEM_SIZE " << sharedMem << " Bytes" << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS " << witemDims << std::endl;
std::cout << "CL_DEVICE_MAX_WORK_ITEM_SIZES : X " << xyzDims[0] << " work items" << std::endl;
std::cout << " : Y " << xyzDims[1] << " work items" << std::endl;
std::cout << " : Z " << xyzDims[2] << " work items" << std::endl;
std::cout << std::endl;
}
So far, the code is pretty much identical to tutorial 001. We have identified a platform and have identified and printed out information about all devices attached to it.
Next, we select devices and create a context in which our kernel will execute.
// create a context that uses some or all available devices
std::cout << "\nCreating a context (grouping devices on which kernels will be run):\n\n";
const cl_device_id listOfDevicesInContext[] = {devices[0]}; // specify list of devices used (based on their ID)
cl_context context = clCreateContext(NULL, // cl_context_properties*
1, // total number of devices we wish to use
listOfDevicesInContext, // specify the device IDs
NULL, // no call-back function registered
NULL, // no user data for the call-back function
&err); // returns error code
In the above code, we have selected devices[0] (the Intel CPU) as the only device in the context. We can just as easily pick a different device, or include several devices, by changing the device list and count, as sketched below.
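For instance, a hypothetical context containing both the CPU (devices[0]) and the NVIDIA GPU (devices[1]) would look like the following sketch; the device indices are assumptions based on the enumeration order we saw in tutorial 001:
// sketch: a context spanning the first two enumerated devices
const cl_device_id twoDevices[] = {devices[0], devices[1]};
cl_context multiContext = clCreateContext(NULL,       // default properties
                                          2,          // number of devices in the list
                                          twoDevices, // the device list
                                          NULL,       // no call-back function registered
                                          NULL,       // no user data for the call-back function
                                          &err);      // returns error code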
We can query the context to find out what devices are present. This is illustrated next:
// How to query the context created above and get information about it
// how to get the number of devices used in a context
cl_uint getNumDevicesInContext;
clGetContextInfo (context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &getNumDevicesInContext, NULL);
// allocate an appropriately sized device list and get the IDs of the
// devices used in the context
cl_device_id* getContextDevices = (cl_device_id*) malloc(sizeof(cl_device_id) * getNumDevicesInContext);
clGetContextInfo (context, CL_CONTEXT_DEVICES, sizeof(cl_device_id) * getNumDevicesInContext, getContextDevices, NULL);
// query the context's reference count (the number of live references to it)
cl_uint ref_count;
clGetContextInfo (context, CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_count), &ref_count, NULL);
// printing information about the context
std::cout << "CL_CONTEXT_NUM_DEVICES " << getNumDevicesInContext << std::endl;
std::cout << "CL_CONTEXT_DEVICES ";
for(unsigned int i=0; i<getNumDevicesInContext; i++) {
clGetDeviceInfo (getContextDevices[i], CL_DEVICE_NAME, sizeof(name_data), name_data , NULL);
std::cout << name_data << " ";
}
std::cout << std::endl;
std::cout << "CL_CONTEXT_REFERENCE_COUNT " << ref_count << std::endl;
Now that we have a context, we are ready to run our kernel on the devices in it. But first we need to read in the kernel source (remember, we stored it in an external file called kernel_add.cl) and build it into a program object. This is accomplished next:
// Read kernel file into a buffer
FILE *file_handle = fopen("kernel_add.cl", "r"); // open the kernel source file
if(file_handle == NULL) {
std::cout << "Couldn't open kernel_add.cl\n";
exit(1);
}
fseek(file_handle, 0, SEEK_END); // move position to end of file
size_t program_size = ftell(file_handle); // get size of the kernel source in bytes
rewind(file_handle);
char *program_buffer = (char*) malloc(program_size + 1);
program_buffer[program_size] = '\0'; // null-terminate the source string
fread(program_buffer, sizeof(char), program_size, file_handle);
fclose(file_handle);
// build the program
cl_program program = clCreateProgramWithSource(context, 1, (const char**) &program_buffer, &program_size, &err);
free(program_buffer);
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// create kernel
cl_kernel kernel = clCreateKernel(program, "add", &err);
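When clBuildProgram fails (typically returning CL_BUILD_PROGRAM_FAILURE), the compiler's diagnostics can be retrieved with clGetProgramBuildInfo. Here is a minimal sketch of such a check, meant to sit right after the clBuildProgram call above; it reuses the program and getContextDevices variables from the host code:
// sketch: print the build log if the kernel failed to compile
if(err < 0) {
size_t log_size;
clGetProgramBuildInfo(program, getContextDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // query log size
char *log = (char*) malloc(log_size);
clGetProgramBuildInfo(program, getContextDevices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // fetch the log
std::cout << log << std::endl;
free(log);
exit(1);
}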
Next, we need to create what's called a command queue. Commands such as kernel launches and memory transfers are submitted to this queue and executed on its associated device. (clCreateCommandQueue was deprecated in OpenCL 2.0 in favor of clCreateCommandQueueWithProperties, but we use the older call here since Apple's platform implements OpenCL 1.2.)
// create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, // specify context
getContextDevices[0], // specify device
0, // default queue properties
&err); // returns error code
We now come to the part where we allocate memory on the host for storing the input and output data and then initialize this data.
// allocate memory buffers on the host
const int N = 100;
int *h_a = new int[N]; // vector "a"
int *h_b = new int[N]; // vector "b"
int *h_c = new int[N]; // c = a + b
// fill input buffers on the host
for(int i = 0; i < N; i++) {
h_a[i] = i;
h_b[i] = 2*i;
}
The next step is to allocate space on the device for storing all the input and output data. Host data is then copied to the device global memory. When kernels execute, they access and work with data in device memory.
// allocate memory buffers on the device
cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(int), NULL, &err);
cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(int), NULL, &err);
cl_mem d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(int), NULL, &err);
// copy input buffers from host memory to device memory
clEnqueueWriteBuffer(command_queue, d_a, CL_TRUE, 0, N * sizeof(int), h_a, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, d_b, CL_TRUE, 0, N * sizeof(int), h_b, 0, NULL, NULL);
Unlike CUDA, where arguments are passed to the kernel in the launch statement itself, OpenCL requires us to set each kernel argument explicitly with clSetKernelArg:
// set kernel function parameters
clSetKernelArg(kernel, 0, sizeof(cl_int), &N ); // parameter 0
clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); // parameter 1
clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); // parameter 2
clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); // parameter 3
We now specify launch parameters:
Note that in this case, we use 100 work-groups and 1 work-item per work-group. More typically, we will use several hundred work-items inside a single work-group. In that case, the total number of work-items in the kernel might be more than the actual size of the data.
The total number of work-items is always an integer multiple of the work-items per work-group.
This is why bound-checking inside kernels is so important. The extra work-items (if any) need to return without touching the data in global memory.
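To make the bound check concrete (an illustration only, not what the code below uses): with N = 100 and localSize = 64, globalSize = ceil(100/64)*64 = 128, so the 28 extra work-items get indices in the range 100-127, fail the if (index < N) test, and return without touching global memory.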
The kernel is launched using clEnqueueNDRangeKernel.
// kernel launch parameters (work distribution)
// Number of work items in each local work group
size_t localSize = 1;
// Number of total work items
size_t globalSize = ceil(N/(float)localSize)*localSize;
// print kernel launch parameters
std::cout << std::endl;
std::cout << "Kernel launch parameters: localSize = " << localSize << std::endl;
std::cout << " globalSize = " << globalSize << std::endl;
std::cout << std::endl;
// execute OpenCL kernel
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
Finally, we copy results back to the host and perform clean-up.
// copy results from device memory to host memory
clEnqueueReadBuffer(command_queue, d_c, CL_TRUE, 0, N * sizeof(int), h_c, 0, NULL, NULL);
// display result using the buffer on the host
std::cout << std::endl;
for(int i = 0; i < N; i++) {
std::cout << "a = " << h_a[i] << " b = " << h_b[i] << " c = a + b = " << h_c[i] << std::endl;
}
// Clean up
err = clFlush(command_queue);
err = clFinish(command_queue);
err = clReleaseKernel(kernel);
err = clReleaseProgram(program);
err = clReleaseMemObject(d_a);
err = clReleaseMemObject(d_b);
err = clReleaseMemObject(d_c);
err = clReleaseCommandQueue(command_queue);
err = clReleaseContext(context); // release the context created earlier
free(getContextDevices);
free(devices);
delete [] h_a;
delete [] h_b;
delete [] h_c;
// main program ends
return 0;
}
Let's summarize the flow of the above OpenCL code:
1. Identify a platform and query its properties.
2. Enumerate the devices attached to that platform.
3. Create a context containing the device(s) the kernel will run on.
4. Read the kernel source from kernel_add.cl, create a program and build it.
5. Create the kernel object and a command queue.
6. Allocate host and device buffers and copy the input data to the device.
7. Set the kernel arguments and launch the kernel with clEnqueueNDRangeKernel.
8. Copy the results back to the host and release all OpenCL objects.
The code can be compiled using the following Makefile:
all:
g++ vecAdd.cpp -o vecAdd.x -framework OpenCL
clean:
rm *.x
Here is the output after running the code:
Detected 1 platforms
Using OpenCL to get more information about the platform:
CL_PLATFORM_NAME --- Apple
CL_PLATFORM_VENDOR --- Apple
CL_PLATFORM_VERSION --- OpenCL 1.2 (Oct 1 2013 18:21:51)
CL_PLATFORM_PROFILE --- FULL_PROFILE
CL_PLATFORM_EXTENSIONS --- cl_APPLE_SetMemObjectDestructor
cl_APPLE_ContextLoggingFunctions
cl_APPLE_clut
cl_APPLE_query_kernel_names
cl_APPLE_gl_sharing cl_khr_gl_event
Number of connected devices found = 3
Running OpenCL code to get device specifications:
-------------------------
CL_DEVICE_NAME Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
CL_DEVICE_AVAILABLE 1
CL_DEVICE_ADDRESS_BITS 64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY 2.3 GHz
CL_DEVICE_GLOBAL_MEM_SIZE 17.1799 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE 1024 work items
CL_DEVICE_LOCAL_MEM_SIZE 32768 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X 1024 work items
: Y 1 work items
: Z 1 work items
-------------------------
CL_DEVICE_NAME GeForce GT 750M
CL_DEVICE_AVAILABLE 1
CL_DEVICE_ADDRESS_BITS 32 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY 0.925 GHz
CL_DEVICE_GLOBAL_MEM_SIZE 2.14748 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE 1024 work items
CL_DEVICE_LOCAL_MEM_SIZE 49152 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X 1024 work items
: Y 1024 work items
: Z 64 work items
-------------------------
CL_DEVICE_NAME Iris Pro
CL_DEVICE_AVAILABLE 1
CL_DEVICE_ADDRESS_BITS 64 bits
CL_DEVICE_MAX_CLOCK_FREQUENCY 1.2 GHz
CL_DEVICE_GLOBAL_MEM_SIZE 1.07374 GB
CL_DEVICE_MAX_WORK_GROUP_SIZE 512 work items
CL_DEVICE_LOCAL_MEM_SIZE 65536 Bytes
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 3
CL_DEVICE_MAX_WORK_ITEM_SIZES : X 512 work items
: Y 512 work items
: Z 512 work items
Creating a context (grouping devices on which kernels will be run):
CL_CONTEXT_NUM_DEVICES 1
CL_CONTEXT_DEVICES Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
CL_CONTEXT_REFERENCE_COUNT 1
Kernel launch parameters: localSize = 1
globalSize = 100
a = 0 b = 0 c = a + b = 0
a = 1 b = 2 c = a + b = 3
a = 2 b = 4 c = a + b = 6
a = 3 b = 6 c = a + b = 9
a = 4 b = 8 c = a + b = 12
a = 5 b = 10 c = a + b = 15
a = 6 b = 12 c = a + b = 18
a = 7 b = 14 c = a + b = 21
a = 8 b = 16 c = a + b = 24
a = 9 b = 18 c = a + b = 27
a = 10 b = 20 c = a + b = 30
a = 11 b = 22 c = a + b = 33
a = 12 b = 24 c = a + b = 36
a = 13 b = 26 c = a + b = 39
a = 14 b = 28 c = a + b = 42
a = 15 b = 30 c = a + b = 45
a = 16 b = 32 c = a + b = 48
a = 17 b = 34 c = a + b = 51
a = 18 b = 36 c = a + b = 54
a = 19 b = 38 c = a + b = 57
a = 20 b = 40 c = a + b = 60
a = 21 b = 42 c = a + b = 63
a = 22 b = 44 c = a + b = 66
a = 23 b = 46 c = a + b = 69
a = 24 b = 48 c = a + b = 72
a = 25 b = 50 c = a + b = 75
a = 26 b = 52 c = a + b = 78
a = 27 b = 54 c = a + b = 81
a = 28 b = 56 c = a + b = 84
a = 29 b = 58 c = a + b = 87
a = 30 b = 60 c = a + b = 90
a = 31 b = 62 c = a + b = 93
a = 32 b = 64 c = a + b = 96
a = 33 b = 66 c = a + b = 99
a = 34 b = 68 c = a + b = 102
a = 35 b = 70 c = a + b = 105
a = 36 b = 72 c = a + b = 108
a = 37 b = 74 c = a + b = 111
a = 38 b = 76 c = a + b = 114
a = 39 b = 78 c = a + b = 117
a = 40 b = 80 c = a + b = 120
a = 41 b = 82 c = a + b = 123
a = 42 b = 84 c = a + b = 126
a = 43 b = 86 c = a + b = 129
a = 44 b = 88 c = a + b = 132
a = 45 b = 90 c = a + b = 135
a = 46 b = 92 c = a + b = 138
a = 47 b = 94 c = a + b = 141
a = 48 b = 96 c = a + b = 144
a = 49 b = 98 c = a + b = 147
a = 50 b = 100 c = a + b = 150
a = 51 b = 102 c = a + b = 153
a = 52 b = 104 c = a + b = 156
a = 53 b = 106 c = a + b = 159
a = 54 b = 108 c = a + b = 162
a = 55 b = 110 c = a + b = 165
a = 56 b = 112 c = a + b = 168
a = 57 b = 114 c = a + b = 171
a = 58 b = 116 c = a + b = 174
a = 59 b = 118 c = a + b = 177
a = 60 b = 120 c = a + b = 180
a = 61 b = 122 c = a + b = 183
a = 62 b = 124 c = a + b = 186
a = 63 b = 126 c = a + b = 189
a = 64 b = 128 c = a + b = 192
a = 65 b = 130 c = a + b = 195
a = 66 b = 132 c = a + b = 198
a = 67 b = 134 c = a + b = 201
a = 68 b = 136 c = a + b = 204
a = 69 b = 138 c = a + b = 207
a = 70 b = 140 c = a + b = 210
a = 71 b = 142 c = a + b = 213
a = 72 b = 144 c = a + b = 216
a = 73 b = 146 c = a + b = 219
a = 74 b = 148 c = a + b = 222
a = 75 b = 150 c = a + b = 225
a = 76 b = 152 c = a + b = 228
a = 77 b = 154 c = a + b = 231
a = 78 b = 156 c = a + b = 234
a = 79 b = 158 c = a + b = 237
a = 80 b = 160 c = a + b = 240
a = 81 b = 162 c = a + b = 243
a = 82 b = 164 c = a + b = 246
a = 83 b = 166 c = a + b = 249
a = 84 b = 168 c = a + b = 252
a = 85 b = 170 c = a + b = 255
a = 86 b = 172 c = a + b = 258
a = 87 b = 174 c = a + b = 261
a = 88 b = 176 c = a + b = 264
a = 89 b = 178 c = a + b = 267
a = 90 b = 180 c = a + b = 270
a = 91 b = 182 c = a + b = 273
a = 92 b = 184 c = a + b = 276
a = 93 b = 186 c = a + b = 279
a = 94 b = 188 c = a + b = 282
a = 95 b = 190 c = a + b = 285
a = 96 b = 192 c = a + b = 288
a = 97 b = 194 c = a + b = 291
a = 98 b = 196 c = a + b = 294
a = 99 b = 198 c = a + b = 297
Now let's have some fun! We have a working version of vector addition (N = 100), with the kernel currently executing on the CPU. How large can N be? What about the case when we execute the kernel on the NVIDIA GPU or on the Intel Iris Pro?
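A good starting point for these experiments is to ask each device how large a single buffer allocation it supports. Here is a minimal sketch of that query (CL_DEVICE_MAX_MEM_ALLOC_SIZE is a standard device-info parameter; the devices array is from the host code above, and maxN is just an illustrative name):
// query the maximum size of a single buffer allocation on device 0
cl_ulong max_alloc;
clGetDeviceInfo(devices[0], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_alloc), &max_alloc, NULL);
// a rough upper bound on N for one int buffer (our three buffers also compete for global memory)
size_t maxN = max_alloc / sizeof(int);
std::cout << "Max single allocation: " << max_alloc << " bytes (up to " << maxN << " ints)\n";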