#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// contain more threads than elements; surplus threads do nothing.
// All three pointers are dereferenced on the device.
__global__ void addKernel(int* c, const int* a, const int* b, int size)
{
    // Compute the global index in 64 bits: with a rounded-up grid and a size
    // close to INT_MAX the product can exceed the range of int, and a wrapped
    // negative index would slip past the bounds check below.
    const long long i =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size)
    {
        c[i] = a[i] + b[i];
    }
}

// Runs the whole allocate / copy-in / launch / copy-out sequence and returns
// the first CUDA error encountered (cudaSuccess when everything worked).
// Device buffers are released on every path.
static cudaError_t addVectorsOnDevice(int* c, const int* a, const int* b, int size)
{
    // Nothing to add; also avoids an invalid zero-sized launch configuration.
    if (size <= 0)
    {
        return cudaSuccess;
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(int);

    // Fixed block size with a ceil-div grid so any positive size is covered
    // without exceeding the per-block thread limit. Written as
    // (size - 1) / T + 1 so the computation cannot overflow int.
    const int threadsPerBlock = 256;
    const int blocks = (size - 1) / threadsPerBlock + 1;

    int* dev_a = nullptr;
    int* dev_b = nullptr;
    int* dev_c = nullptr;

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaError_t status = cudaMalloc((void**)&dev_c, bytes);
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&dev_a, bytes);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&dev_b, bytes);
    }

    // Copy input vectors from host memory to GPU buffers.
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    }

    // Launch the kernel with one thread per element.
    if (status == cudaSuccess)
    {
        addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);
        // A launch does not return a status itself; configuration errors are
        // reported through cudaGetLastError().
        status = cudaGetLastError();
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // error raised while it was executing.
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    // Copy output vector from GPU buffer to host memory.
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    }

    // cudaFree on a null pointer performs no operation, so this is safe even
    // when an allocation above failed.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return status;
}

// Helper function for using CUDA to add vectors in parallel.
//
// c    - host output buffer with room for `size` ints
// a, b - host input buffers of `size` ints each
// size - number of elements; values <= 0 are treated as "nothing to do"
//
// On failure a diagnostic is written to stderr; the contents of `c` are only
// written when every step succeeded.
void addWithCuda(int* c, const int* a, const int* b, int size)
{
    const cudaError_t status = addVectorsOnDevice(c, a, b, size);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed: %s\n", cudaGetErrorString(status));
    }
}

int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Call the status-returning helper directly so a failure becomes a
    // non-zero exit code instead of a printed vector of zeros.
    const cudaError_t addStatus = addVectorsOnDevice(c, a, b, arraySize);
    if (addStatus != cudaSuccess)
    {
        fprintf(stderr, "Vector addition failed: %s\n", cudaGetErrorString(addStatus));
        cudaDeviceReset();
        return 1;
    }

    printf("{1, 2, 3, 4, 5} + {10, 20, 30, 40, 50} = {%d, %d, %d, %d, %d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    const cudaError_t resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(resetStatus));
        return 1;
    }

    return 0;
}