This document provides code for performing a parallel prefix sum (scan) operation on a list of values using CUDA. It defines a work-efficient scan kernel that loads a segment of the input list into shared memory, performs reduction and post-reduction passes within the block to compute per-block prefix sums, and writes the results to the output array; the per-block totals are then scanned and added back in a fixup pass so the scan covers the whole list. The main function sets up the input and output arrays on the host and device, launches the kernels to perform the scan, and checks the results. For example, the inclusive scan of {3, 1, 7, 0, 4} is {3, 4, 11, 11, 15}.
Parallel Scan in C / CUDA
// MP Scan
// Given a list (lst) of length n,
// output its inclusive prefix sum:
//   {lst[0], lst[0] + lst[1], ..., lst[0] + lst[1] + ... + lst[n-1]}
// e.g., the scan of {3, 1, 7, 0, 4} is {3, 4, 11, 11, 15}.

#include <wb.h>
#define BLOCK_SIZE 512 //@@ You can change this
#define wbCheck(stmt) do {                                                 \
        cudaError_t err = stmt;                                            \
        if (err != cudaSuccess) {                                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
            return -1;                                                     \
        }                                                                  \
    } while (0)

// Work-efficient (Brent-Kung) scan: each block of BLOCK_SIZE threads
// computes the inclusive prefix sum of a 2 * BLOCK_SIZE element segment.
// blockSums (may be NULL) receives each block's total so a second-level
// scan can combine the blocks.
__global__ void scan(float * input, float * output, float * blockSums, int len) {
    //@@ Modify the body of this function to complete the functionality of
    //@@ the scan on the device
    //@@ You may need multiple kernel calls; write your kernels before this

    // Load a segment of the input vector into shared memory,
    // padding with zeros past the end of the input.
    __shared__ float XY[2 * BLOCK_SIZE];
    unsigned int tx = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
    XY[tx] = (start + tx < len) ? input[start + tx] : 0.0f;
    XY[BLOCK_SIZE + tx] = (start + BLOCK_SIZE + tx < len) ? input[start + BLOCK_SIZE + tx] : 0.0f;
    __syncthreads();

    // Reduction (up-sweep): build a tree of partial sums in place.
    for (int stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
        int index = (tx + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE)
            XY[index] += XY[index - stride];
        __syncthreads();
    }

    // Post reduction (down-sweep): distribute the partial sums back down.
    for (int stride = BLOCK_SIZE >> 1; stride >= 1; stride >>= 1) {
        int index = (tx + 1) * stride * 2 - 1;
        if (index + stride < 2 * BLOCK_SIZE)
            XY[index + stride] += XY[index];
        __syncthreads();
    }

    // Write both halves of the segment back to global memory.
    if (start + tx < len)
        output[start + tx] = XY[tx];
    if (start + BLOCK_SIZE + tx < len)
        output[start + BLOCK_SIZE + tx] = XY[BLOCK_SIZE + tx];

    // Record this block's total for the second-level scan.
    if (blockSums != NULL && tx == 0)
        blockSums[blockIdx.x] = XY[2 * BLOCK_SIZE - 1];
}
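The main function below launches a fixup kernel that this listing never defines. The following is a minimal sketch consistent with that call, assuming an inclusive scan in which each block covers 2 * BLOCK_SIZE elements and aux holds the inclusive prefix sums of the per-block totals:

// Add the total of all preceding blocks to every element of this block.
// aux[b] is the inclusive prefix sum of block totals 0..b, so block b
// (for b > 0) must be offset by aux[b - 1].
__global__ void fixup(float * output, float * aux, int len) {
    unsigned int tx = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
    if (blockIdx.x > 0) {
        float offset = aux[blockIdx.x - 1];
        if (start + tx < len)
            output[start + tx] += offset;
        if (start + BLOCK_SIZE + tx < len)
            output[start + BLOCK_SIZE + tx] += offset;
    }
}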
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostInput;              // The input 1D list
    float * hostOutput;             // The output list
    float * deviceInput;
    float * deviceOutput;
    float * deviceAux;              // one partial sum per scan block
    float * deviceAuxScannedArray;  // prefix sums of the per-block sums
    int numElements;                // number of elements in the list

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float *) malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The number of input elements in the input is ", numElements);

    //@@ Initialize the grid and block dimensions here
    // Each block scans 2 * BLOCK_SIZE elements.
    int numOfBlocks = (numElements - 1) / (2 * BLOCK_SIZE) + 1;
    dim3 gridSize(numOfBlocks);

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void **) &deviceInput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceOutput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceAux, numOfBlocks * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceAuxScannedArray, numOfBlocks * sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Modify this to complete the functionality of the scan
    //@@ on the device
    // Phase 1: scan each block independently, recording each block's total.
    scan<<<gridSize, BLOCK_SIZE>>>(deviceInput, deviceOutput, deviceAux, numElements);
    cudaDeviceSynchronize();
    // Phase 2: scan the per-block totals in a single block. This limits the
    // input to 2 * BLOCK_SIZE blocks, i.e. (2 * BLOCK_SIZE)^2 elements.
    scan<<<1, BLOCK_SIZE>>>(deviceAux, deviceAuxScannedArray, NULL, numOfBlocks);
    cudaDeviceSynchronize();
    // Phase 3: add each block's preceding total to its scanned elements.
    fixup<<<gridSize, BLOCK_SIZE>>>(deviceOutput, deviceAuxScannedArray, numElements);
    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float), cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    cudaFree(deviceAux);
    cudaFree(deviceAuxScannedArray);
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);

    return 0;
}
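The wb harness verifies hostOutput through wbSolution. For checking results outside the harness, a sequential reference scan can be compared element-wise against hostOutput; a minimal sketch (cpuScan is an illustrative helper, not part of the original assignment):

// Sequential inclusive scan: output[i] = input[0] + ... + input[i].
void cpuScan(const float * input, float * output, int len) {
    float sum = 0.0f;
    for (int i = 0; i < len; ++i) {
        sum += input[i];
        output[i] = sum;
    }
}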