This document provides code for performing a parallel prefix sum (scan) operation on a list of values using CUDA. It defines a work-efficient scan kernel that loads a segment of the input list into shared memory, performs reduction and post-reduction passes within the block to compute per-block prefix sums, and writes the results to the output array; the per-block totals are then scanned and added back in a fixup pass so the scan covers the whole list. The main function sets up the input and output arrays on the host and device, launches the kernels to perform the scan, and checks the results. For example, the inclusive scan of {3, 1, 7, 0, 4} is {3, 4, 11, 11, 15}.
Parallel Scan in C / CUDA
// MP Scan
// Given a list (lst) of length n,
// output its inclusive prefix sum:
//   {lst[0], lst[0] + lst[1], ..., lst[0] + lst[1] + ... + lst[n-1]}
// e.g., the scan of {3, 1, 7, 0, 4} is {3, 4, 11, 11, 15}.

#include <wb.h>
#define BLOCK_SIZE 512 //@@ You can change this
#define wbCheck(stmt) do {                                                 \
        cudaError_t err = stmt;                                            \
        if (err != cudaSuccess) {                                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
            return -1;                                                     \
        }                                                                  \
    } while (0)

// Work-efficient (Brent-Kung) scan: each block of BLOCK_SIZE threads
// computes the inclusive prefix sum of a 2 * BLOCK_SIZE element segment.
// blockSums (may be NULL) receives each block's total so a second-level
// scan can combine the blocks.
__global__ void scan(float * input, float * output, float * blockSums, int len) {
    //@@ Modify the body of this function to complete the functionality of
    //@@ the scan on the device
    //@@ You may need multiple kernel calls; write your kernels before this

    // Load a segment of the input vector into shared memory,
    // padding with zeros past the end of the input.
    __shared__ float XY[2 * BLOCK_SIZE];
    unsigned int tx = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
    XY[tx] = (start + tx < len) ? input[start + tx] : 0.0f;
    XY[BLOCK_SIZE + tx] = (start + BLOCK_SIZE + tx < len) ? input[start + BLOCK_SIZE + tx] : 0.0f;
    __syncthreads();

    // Reduction (up-sweep): build a tree of partial sums in place.
    for (int stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
        int index = (tx + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE)
            XY[index] += XY[index - stride];
        __syncthreads();
    }

    // Post reduction (down-sweep): distribute the partial sums back down.
    for (int stride = BLOCK_SIZE >> 1; stride >= 1; stride >>= 1) {
        int index = (tx + 1) * stride * 2 - 1;
        if (index + stride < 2 * BLOCK_SIZE)
            XY[index + stride] += XY[index];
        __syncthreads();
    }

    // Write both halves of the segment back to global memory.
    if (start + tx < len)
        output[start + tx] = XY[tx];
    if (start + BLOCK_SIZE + tx < len)
        output[start + BLOCK_SIZE + tx] = XY[BLOCK_SIZE + tx];

    // Record this block's total for the second-level scan.
    if (blockSums != NULL && tx == 0)
        blockSums[blockIdx.x] = XY[2 * BLOCK_SIZE - 1];
}
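The main function below launches a fixup kernel that this listing never defines. The following is a minimal sketch consistent with that call, assuming an inclusive scan in which each block covers 2 * BLOCK_SIZE elements and aux holds the inclusive prefix sums of the per-block totals:

// Add the total of all preceding blocks to every element of this block.
// aux[b] is the inclusive prefix sum of block totals 0..b, so block b
// (for b > 0) must be offset by aux[b - 1].
__global__ void fixup(float * output, float * aux, int len) {
    unsigned int tx = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * BLOCK_SIZE;
    if (blockIdx.x > 0) {
        float offset = aux[blockIdx.x - 1];
        if (start + tx < len)
            output[start + tx] += offset;
        if (start + BLOCK_SIZE + tx < len)
            output[start + BLOCK_SIZE + tx] += offset;
    }
}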
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostInput;              // The input 1D list
    float * hostOutput;             // The output list
    float * deviceInput;
    float * deviceOutput;
    float * deviceAux;              // one partial sum per scan block
    float * deviceAuxScannedArray;  // prefix sums of the per-block sums
    int numElements;                // number of elements in the list

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float *) malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The number of input elements in the input is ", numElements);

    //@@ Initialize the grid and block dimensions here
    // Each block scans 2 * BLOCK_SIZE elements.
    int numOfBlocks = (numElements - 1) / (2 * BLOCK_SIZE) + 1;
    dim3 gridSize(numOfBlocks);

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void **) &deviceInput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceOutput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceAux, numOfBlocks * sizeof(float)));
    wbCheck(cudaMalloc((void **) &deviceAuxScannedArray, numOfBlocks * sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Modify this to complete the functionality of the scan
    //@@ on the device
    // Phase 1: scan each block independently, recording each block's total.
    scan<<<gridSize, BLOCK_SIZE>>>(deviceInput, deviceOutput, deviceAux, numElements);
    cudaDeviceSynchronize();
    // Phase 2: scan the per-block totals in a single block. This limits the
    // input to 2 * BLOCK_SIZE blocks, i.e. (2 * BLOCK_SIZE)^2 elements.
    scan<<<1, BLOCK_SIZE>>>(deviceAux, deviceAuxScannedArray, NULL, numOfBlocks);
    cudaDeviceSynchronize();
    // Phase 3: add each block's preceding total to its scanned elements.
    fixup<<<gridSize, BLOCK_SIZE>>>(deviceOutput, deviceAuxScannedArray, numElements);
    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float), cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    cudaFree(deviceAux);
    cudaFree(deviceAuxScannedArray);
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);

    return 0;
}
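The wb harness verifies hostOutput through wbSolution. For checking results outside the harness, a sequential reference scan can be compared element-wise against hostOutput; a minimal sketch (cpuScan is an illustrative helper, not part of the original assignment):

// Sequential inclusive scan: output[i] = input[0] + ... + input[i].
void cpuScan(const float * input, float * output, int len) {
    float sum = 0.0f;
    for (int i = 0; i < len; ++i) {
        sum += input[i];
        output[i] = sum;
    }
}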