
Parallel Programming

Introduction to CUDA C/C++


Part I

Phạm Trọng Nghĩa


ptnghia@fit.hcmus.edu.vn
Data parallelism
• Question: Why do modern software applications run slowly?
• Answer: too much data to process
• Image-processing apps: millions to trillions of pixels
• Molecular dynamics apps: thousands to billions of atoms
• Data parallelism: organize the computation around the data so that the
resulting independent computations can be executed in parallel,
completing the overall job faster, often much faster.

2
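As a minimal illustration (my own sketch, not from the slides; the function name and parameters are hypothetical), the loop below is data parallel: each iteration reads and writes only its own element, so all iterations are independent and can, in principle, run at the same time.

void scale_pixels(const float *in, float *out, int n, float factor)
{
    // Each output element depends only on the input element with the
    // same index, so no iteration has to wait for any other one.
    for (int i = 0; i < n; i++)
        out[i] = in[i] * factor;
}

In CUDA, such a loop is typically turned into a kernel in which each thread handles one index i, as the following slides show for vector addition.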
CUDA C/C++ is extended C/C++ that allows us to write a program running on both the CPU (sequential parts) and the GPU (massively parallel parts).

Host = CPU (+ its memory): the host runs the serial code sequentially.
Device = GPU: the device runs the parallel code (the kernel function) in parallel.

#include <iostream>
#include <algorithm>

using namespace std;

#define N 1024
#define RADIUS 3
#define BLOCK_SIZE 16

// Parallel function (kernel): runs on the device
__global__ void stencil_1d(int *in, int *out) {
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

    // Synchronize (ensure all the data is available)
    __syncthreads();

    // Apply the stencil
    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
        result += temp[lindex + offset];

    // Store the result
    out[gindex] = result;
}

void fill_ints(int *x, int n) {
    fill_n(x, n, 1);
}

// Serial code: runs on the host
int main(void) {
    int *in, *out;      // host copies
    int *d_in, *d_out;  // device copies
    int size = (N + 2 * RADIUS) * sizeof(int);

    // Alloc space for host copies and setup values
    in = (int *)malloc(size);  fill_ints(in, N + 2 * RADIUS);
    out = (int *)malloc(size); fill_ints(out, N + 2 * RADIUS);

    // Alloc space for device copies
    cudaMalloc((void **)&d_in, size);
    cudaMalloc((void **)&d_out, size);

    // Copy to device
    cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice);

    // Launch stencil_1d() kernel on GPU (parallel code)
    stencil_1d<<<N / BLOCK_SIZE, BLOCK_SIZE>>>(d_in + RADIUS, d_out + RADIUS);

    // Copy result back to host (serial code continues)
    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(in); free(out);
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}

Image source: NVIDIA. CUDA C/C++ Basics


3
A simple CUDA program:
adding 2 vectors
• Adding 2 vectors sequentially using the host
• Adding 2 vectors in parallel using the device: each thread on the device is
responsible for computing one element of the sum vector, and all these
threads run in parallel
• Who wins?

Image source: NVIDIA. CUDA C/C++ Basics


4
int main(int argc, char **argv)
{
    int n;            // Vector size
    float *in1, *in2; // Input vectors
    float *out;       // Output vector

    // Input data into n
    ...

    // Allocate memories for in1, in2, out
    ...

    // Input data into in1, in2
    ...

    // Add vectors (on host)
    addVecOnHost(in1, in2, out, n);

    // Free memories
    ...

    return 0;
}

void addVecOnHost(float* in1, float* in2, float* out, int n)
{
    for (int i = 0; i < n; i++)
        out[i] = in1[i] + in2[i];
}
5
int main(int argc, char **argv)
{
    int n;            // Vector size
    float *in1, *in2; // Input vectors
    float *out;       // Output vector

    // Input data into n
    ...

    // Allocate memories for in1, in2, out
    ...

    // Input data into in1, in2
    ...

    // Add vectors (on host)
    addVecOnHost(in1, in2, out, n);

    // Free memories
    ...

    return 0;
}

Outline of the device version (the steps the host performs):

    // Host allocates memories on device
    ...
    // Host copies data to device memories
    ...
    // Host invokes kernel function to add vectors on device
    ...
    // Host copies result from device memory
    ...
    // Host frees device memories
    ...

Image source: Mark Harris. Unified Memory in CUDA 6
6
// Host allocates memories on device
float *d_in1, *d_in2, *d_out;
cudaMalloc(&d_in1, n * sizeof(float));
cudaMalloc(&d_in2, n * sizeof(float));
cudaMalloc(&d_out, n * sizeof(float));

// Host copies data to device memories


...

// Host invokes kernel function to add vectors on device


...

// Host copies result from device memory


...

// Host frees device memories


...

7
// Host allocates memories on device
float *d_in1, *d_in2, *d_out;
cudaMalloc(&d_in1, n * sizeof(float));
cudaMalloc(&d_in2, n * sizeof(float));
cudaMalloc(&d_out, n * sizeof(float));

// Host copies data to device memories


cudaMemcpy(d_in1, in1, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, n * sizeof(float), cudaMemcpyHostToDevice);

// Host invokes kernel function to add vectors on device


...

// Host copies result from device memory


...

// Host frees device memories


...

8
// Host allocates memories on device
float *d_in1, *d_in2, *d_out;
cudaMalloc(&d_in1, n * sizeof(float));
cudaMalloc(&d_in2, n * sizeof(float));
cudaMalloc(&d_out, n * sizeof(float));

// Host copies data to device memories


cudaMemcpy(d_in1, in1, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, n * sizeof(float), cudaMemcpyHostToDevice);

// Host invokes kernel function to add vectors on device


...

// Host copies result from device memory


cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

// Host frees device memories


...

9
// Host allocates memories on device
float *d_in1, *d_in2, *d_out;
cudaMalloc(&d_in1, n * sizeof(float));
cudaMalloc(&d_in2, n * sizeof(float));
cudaMalloc(&d_out, n * sizeof(float));

// Host copies data to device memories


cudaMemcpy(d_in1, in1, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, n * sizeof(float), cudaMemcpyHostToDevice);

// Host invokes kernel function to add vectors on device


...

// Host copies result from device memory


cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

// Host frees device memories


cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);

10
// Host allocates memories on device
float *d_in1, *d_in2, *d_out;
cudaMalloc(&d_in1, n * sizeof(float));
cudaMalloc(&d_in2, n * sizeof(float));
cudaMalloc(&d_out, n * sizeof(float));

// Host copies data to device memories
cudaMemcpy(d_in1, in1, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, n * sizeof(float), cudaMemcpyHostToDevice);

// Host invokes kernel function to add vectors on device
dim3 blockSize(256); // Number of threads in one block; for simplicity, you can temporarily view blockSize as a number
dim3 gridSize((n - 1) / blockSize.x + 1); // Number of blocks needed; similarly, view gridSize as a number
addVecOnDevice<<<gridSize, blockSize>>>(d_in1, d_in2, d_out, n);

This command creates on the device a bunch of threads (called a grid) that execute the addVecOnDevice function in parallel; these threads are organized into gridSize groups, or blocks, and each group/block consists of blockSize threads.

[Figure: a Grid made up of several Blocks, each Block containing many threads]

// Host copies result from device memory
cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

// Host frees device memories
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
11
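A quick worked check of the gridSize formula (my own example, using the n = 700 case shown on the next slide): with blockSize.x = 256,

    gridSize.x = (700 - 1) / 256 + 1 = 2 + 1 = 3   (integer division)

so the launch creates 3 blocks × 256 threads = 768 threads, slightly more than the 700 elements; the surplus threads are exactly the ones filtered out by the if (i < n) check in the kernel on the next slide.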
...
// Host invokes kernel function to add vectors on device
dim3 blockSize(256);
dim3 gridSize((n - 1) / blockSize.x + 1);
addVecOnDevice<<<gridSize, blockSize>>>(d_in1, d_in2, d_out, n);
...

// Kernel functions must return "void"
__global__ void addVecOnDevice(float* in1, float* in2, float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x; // Data index of this thread
    if (i < n)
        out[i] = in1[i] + in2[i];
}

[Figure: mapping from threads to data indices for n = 700; threads whose index i is 700 or larger fall past the end of the data and are redundant, which is why the kernel needs the if (i < n) check.]

12
More on CUDA Function Declarations

                                  Callable from   Executed on   Executed by
__device__ float DeviceFunc()     device          device        Caller device thread
__global__ void KernelFunc()      host            device        New grid of device threads
__host__ float HostFunc()         host            host          Caller host thread

• __global__ defines a kernel function
• A kernel function must return void
• __device__ and __host__ can be used together
• Generates two versions of object code for the same function
• __host__ is optional if used alone.
13
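A small sketch of the last two bullets (my own example, not from the slides; the function names are hypothetical): a function declared with both __host__ and __device__ is compiled twice, giving a device version callable from kernels and a host version callable from ordinary CPU code.

// Compiled into two versions of object code: one for host, one for device
__host__ __device__ float square(float x)
{
    return x * x;
}

__global__ void squareAll(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] = square(data[i]);   // device version, called from a kernel
}

// On the host: float y = square(3.0f);   // host version, called from CPU code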
Compiling A CUDA Program
• Use NVCC (the NVIDIA CUDA compiler driver)

14
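As a concrete illustration (the file name here is hypothetical), CUDA source files use the .cu extension, and a single-file program can typically be compiled and run like this:

    nvcc addVec.cu -o addVec
    ./addVec

nvcc splits the source: the host parts are handed to the regular C/C++ compiler on the system, while the device parts are compiled for the GPU.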
Kernel function execution is
asynchronous w.r.t host by default
After the host calls a kernel function to be executed on the device, the host
is free to do other work without waiting for the kernel to complete.

...
// Host invokes kernel function to add vectors on device
dim3 blockSize(256);
dim3 gridSize((n - 1) / blockSize.x + 1);
addVecOnDevice<<<gridSize, blockSize>>>(d_in1, d_in2, d_out, n);

// Host copies result from device memory
cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost); // OK?

OK, because the cudaMemcpy function forces the host to wait until the kernel finishes; only then does it start to copy.
15
Kernel function execution is
asynchronous w.r.t host by default
...
// Host invokes kernel function to add vectors on device
dim3 blockSize(256);
dim3 gridSize((n - 1) / blockSize.x + 1);
double start = seconds(); // seconds is my function to get the current time
addVecOnDevice<<<gridSize, blockSize>>>(d_in1, d_in2, d_out, n);
double time = seconds() - start; // OK?

16
Kernel function execution is
asynchronous w.r.t host by default
...
// Host invokes kernel function to add vectors on device
dim3 blockSize(256);
dim3 gridSize((n - 1) / blockSize.x + 1);
double start = seconds(); // seconds is my function to get the current time
addVecOnDevice<<<gridSize, blockSize>>>(d_in1, d_in2, d_out, n);
cudaDeviceSynchronize(); // Host waits here until device completes its work
double time = seconds() - start; // ✓

17
Error checking
when calling CUDA API functions
• It is possible that an error happens but the CUDA program still runs normally
and gives a wrong result
• → we don't know where to fix the bug
• → to know where to fix the bug, we should always check errors when calling
CUDA API functions
• For convenience, we can define a macro to check errors and wrap it around
CUDA API function calls

#define CHECK(call)                                                            \
{                                                                              \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess)                                                    \
    {                                                                          \
        printf("%s in %s at line %d!\n", cudaGetErrorString(err),              \
               __FILE__, __LINE__);                                            \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

18
// Host allocates memories on device
float *d_in1, *d_in2, *d_out;
CHECK(cudaMalloc(&d_in1, n * sizeof(float)));
CHECK(cudaMalloc(&d_in2, n * sizeof(float)));
CHECK(cudaMalloc(&d_out, n * sizeof(float)));

// Host copies data to device memories


CHECK(cudaMemcpy(d_in1, in1, n * sizeof(float), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_in2, in2, n * sizeof(float), cudaMemcpyHostToDevice));

// Host invokes kernel function to add vectors on device


dim3 blockSize(256);
dim3 gridSize((n - 1) / blockSize.x + 1);
addVecOnDevice<<<gridSize, blockSize>>>(d_in1, d_in2, d_out, n);

// Host copies result from device memory


CHECK(cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost));

// Host frees device memories


CHECK(cudaFree(d_in1));
CHECK(cudaFree(d_in2));
CHECK(cudaFree(d_out));
19
Error checking
when calling kernel functions?
Read here, “Handling CUDA Errors” section

20
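A minimal sketch of the usual approach (my own summary, not necessarily what the linked "Handling CUDA Errors" section shows): a kernel launch does not return a cudaError_t, so after launching we query the error state with cudaGetLastError (to catch launch errors) and, if we also want errors raised while the kernel runs, synchronize and check again. The CHECK macro from slide 18 is assumed to be in scope.

addVecOnDevice<<<gridSize, blockSize>>>(d_in1, d_in2, d_out, n);
CHECK(cudaGetLastError());        // catches launch-time errors (e.g., invalid configuration)
CHECK(cudaDeviceSynchronize());   // catches errors that occur during kernel execution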
Experiment: host vs device
• Generate input vectors with random values in [0, 1]
• Compare running time between host (addVecOnHost
function) and device (addVecOnDevice function, block size
512) with different vector sizes
• GPU: GeForce GTX 560 Ti (compute capability 2.1)

21
Experiment: host vs device

Vec size     Host time (ms)   Device time (ms)   Host time / Device time
64           0.001            0.040              0.024
256          0.002            0.018              0.118
1024         0.006            0.017              0.347
4096         0.030            0.017              1.775
16384        0.127            0.017              7.403
65536        0.516            0.055              9.409
262144       1.028            0.197              5.220
1048576      3.773            0.277              13.619
4194304      13.870           0.617              22.479
16777216     55.177           1.993              27.683

30
Reference
• [1] Slides from Illinois-NVIDIA GPU Teaching Kit
• [2] Wen-mei W. Hwu, David B. Kirk, and Izzat El Hajj. Programming
Massively Parallel Processors: A Hands-on Approach. Morgan
Kaufmann, 2022.

31
THE END

32
