Using CUDA
Oswald Haan
ohaan@gwdg.de
Code Examples:
cp -r ~ohaan/cuda_kurs/* .
cp -r ~ohaan/cuda_kurs_f/* .
A first Example: Adding two Vectors y = a*x+y
Host code for saxpy routine:

void saxpy( int n, float a, float *x, float *y )
{
  int i;
  for (i=0; i<n; i++) {
    y[i] = a*x[i] + y[i];
  }
}

Device code for saxpy_d kernel routine:

__global__ void saxpy_d( int n, float a, float *x, float *y )
{
  int i = threadIdx.x;
  if (i<n) {
    y[i] = a*x[i] + y[i];
  }
}
A first Example: Adding two Vectors
Host code for calling sequential routine:

int main( void ) {
  int N = 1024, i;
  float *x, *y;
  x = (float*)malloc(N*sizeof(float));
  y = (float*)malloc(N*sizeof(float));
  ...

Host code for calling kernel routine:

int main( void ) {
  int N = 1024, i;
  float *x, *y, *x_d, *y_d;
  x = (float*)malloc(N*sizeof(float));
  y = (float*)malloc(N*sizeof(float));
  cudaMalloc( &x_d, N*sizeof(float));
  cudaMalloc( &y_d, N*sizeof(float));
  ...

code in
~ohaan/cuda_kurs/saxpy.cu
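The slide stops after the allocations. A minimal sketch of how the kernel-calling main might continue (the initialization values and the cleanup are assumptions, not the literal contents of saxpy.cu):

  for (i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; }

  /* copy the input data from host memory to device memory */
  cudaMemcpy(x_d, x, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(y_d, y, N*sizeof(float), cudaMemcpyHostToDevice);

  /* launch the kernel with one block of N threads:
     n and a are passed by value, x_d and y_d are device addresses */
  saxpy_d<<<1,N>>>(N, 2.0f, x_d, y_d);

  /* copy the result back from device memory to host memory */
  cudaMemcpy(y, y_d, N*sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(x_d); cudaFree(y_d);
  free(x); free(y);
  return 0;
}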
Managing Memory on Host and on Device
a = (float*)malloc(3*sizeof(float));
Allocates memory for three floats at address a in host memory
cudaMalloc( &a_d, 3*sizeof(float) );
Allocates memory for three floats at address a_d in device memory
Stores this address at address &a_d in host memory
[Diagram: the host memory cell at address &a_d holds the value a_d, the device address returned by cudaMalloc; the device memory cells at addresses a_d, a_d+1, a_d+2 hold the values a_d[0], a_d[1], a_d[2]; the host memory cells at addresses a, a+1, a+2 hold the three floats allocated by malloc.]
code in
~ohaan/cuda_kurs/saxpy_um.cu
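The file saxpy_um.cu itself is not reproduced on the slide. Presumably it replaces the separate host and device allocations by unified memory; a minimal sketch of that pattern with cudaMallocManaged (an illustration, not necessarily the literal file contents):

int main( void ) {
  int N = 1024, i;
  float *x, *y;

  /* one allocation per array, reachable from host and device
     under the same address (unified memory) */
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  for (i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; }

  saxpy_d<<<1,N>>>(N, 2.0f, x, y);  /* no explicit cudaMemcpy needed */
  cudaDeviceSynchronize();          /* wait before the host reads y */

  cudaFree(x);
  cudaFree(y);
  return 0;
}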
Unified Memory with Static Allocation
const int N=1024;
__device__ __managed__ float x[N], y[N];
__global__
void saxpy_d( float a ) {
y[threadIdx.x] = a*x[threadIdx.x] + y[threadIdx.x];
}
int main(void) {
  int i;
  /* initialize the managed arrays directly from the host */
  for (i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; }
  saxpy_d<<<1,N>>>(2.0f);
  cudaDeviceSynchronize();
  ...
code in
~ohaan/cuda_kurs/saxpy_um_static.cu
Compiling CUDA codes
• CUDA source files must have the extension .cu
• Compiler nvcc is provided in the CUDA toolkit
• CUDA toolkit is available on GWDG's cluster frontends
gwdu101, gwdu102
by loading the CUDA toolkit module:
module load cuda
or sourcing the prepared command file
. module.x
• Compiling CUDA source file saxpy.cu with
nvcc -arch=sm_<xx> saxpy.cu -o saxpy
produces executable saxpy,
where xx is the compute capability of the GPU to be used;
xx = 52 is the default if the option -arch is not set.
With -arch=all code for all possible compute capabilities will be generated
Execution environment for CUDA executables
• GWDG’s compute cluster is operated by the workload manager Slurm, which
provides commands for allocating resources, for submitting jobs and for enquiring the
status of the cluster and of jobs
• All nodes with GPU devices belong to the Slurm partition gpu
• The Slurm sinfo command provides a list of nodes with types and numbers of gpus:
> sinfo -p gpu --format=%N,%G
NODELIST,GRES
dge[008-015],gpu:gtx980:4
dge[001-007],gpu:gtx1080:2
dte[001-010],gpu:k40:2
agt[001-002],gpu:v100:8
agq[001-012],gpu:rtx5000:4
General Slurm options
-p|--partition=gpu allocates resources providing gpus
--reservation=gpu-course allocates resources reserved for this course
-N|--nodes=<n> allocates <n> nodes
-n|--ntasks=<n> allocates resources for starting <n> tasks
--ntasks-per-node=<n> allocates resources for starting <n> tasks per node.
(If used with -n, it denotes the maximum number of tasks per node)
-t|--time=<hh:mm:ss> maximum runtime.
(After this time the job is killed)
-o|--output=<filename> store job output in file <filename>
(If this option is omitted, output is stored in slurm-%J.out, where %J is the jobid)
Slurm Options for Allocating GPUs
-G|--gpus=<n> requests <n> gpus of any kind
--gpus-per-node=<n> requests <n> gpus of any kind per node
Particular types of GPUs can be requested by replacing the <n> in the two options by
<type:n>
The available types on the GWDG Scientific Compute Cluster are currently:
gtx1080, gtx980, k40, v100, rtx5000
#include <stdio.h>
#include <unistd.h>

int main() {
  char name[1024];
  int nDevices;
  cudaGetDeviceCount(&nDevices);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  gethostname(name, 1024);
  printf("Node name:%s, gpus: %i, Device name:%s, Comp.Cap.:%i.%i\n",
         name, nDevices, prop.name, prop.major, prop.minor);
  return(0);
}

See ~ohaan/cuda_kurs/enquire_gpu.cu
Allocation Examples
gwdu103 > nvcc -o enquire.exe enquire_gpu.cu
gwdu103 > salloc -p gpu -N 1 -G rtx5000:3
...
bash-4.2$ srun ./enquire.exe
Node name:agq007, gpus: 3, Device name:Quadro RTX 5000, Comp.Cap.:7.5
The first two actual arguments in the calling sequence of the kernel (n and a) have not been declared on the host to reside in device memory; therefore they have to be passed by value, not by reference.
CUDA Fortran: Adding two Vectors
Host code for calling sequential routine:

program call_saxpy
  use kernel
  implicit none
  integer, parameter :: N = 1024
  integer :: i
  real :: x(N), y(N)
  do i = 1 , N
    x(i) = 1.0
    y(i) = 2.0
  end do
  ...

Host code for calling kernel routine:

program call_saxpy
  use kernel
  use cudafor
  implicit none
  integer, parameter :: N = 1024
  integer :: i
  real :: x(N), y(N)
  real, device :: x_d(N), y_d(N)
  do i = 1 , N
    x(i) = 1.0
    y(i) = 2.0
  end do
  x_d = x
  y_d = y
  ...

code in
~ohaan/cuda_kurs_f/saxpy.cuf
CUDA Fortran: using the kernel directive
program call_saxpy
use kernel
use cudafor
implicit none
integer, parameter :: N = 1024
integer :: i
real :: x(N), y(N)
real, device :: x_d(N), y_d(N)
do i = 1 , N
x(i) = 1.0
y(i) = 2.0
end do
x_d = x
y_d = y
code in
~ohaan/cuda_kurs_f/saxpy_dir.cuf
CUDA Fortran Unified Memory
program call_saxpy
use kernel
use cudafor
implicit none
integer, parameter :: N = 1024
integer :: i, istat
real, managed :: x(N), y(N)
real :: maxerr
do i = 1 , N
x(i) = 1.0
y(i) = 2.0
end do
! (kernel launch and synchronization not shown on this slide)
maxerr=0.0
do i = 1 , N
maxerr = max(maxerr,abs(y(i)-4.0))
end do
write(6,*)' maxerr = ',maxerr
int main() {
  int nDevices;
  cudaGetDeviceCount(&nDevices);
  for (int i = 0; i < nDevices; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device Number: %d\n", i);
    printf("  Device name: %s\n", prop.name);
    ...
  }
}

complete code for enquiring in ~ohaan/cuda_kurs/device_properties.cu
Output from program device_properties.cu
Device Number: 0
  Device name: GeForce GTX 980
  Device capability major revision number: 5
  Device capability minor revision number: 2
  Clock Rate (KHz): 1240500
  total Global Memory (byte): 4294770688
  Shared Memory per Block (byte): 49152
  total Constant Memory (byte): 65536
  size of L2 cache (byte): 2097152
  32-bit Registers per Block: 65536
  max. Threads per Block: 1024
  number of Threads in Warp: 32
  number of Multiprocessors: 16
  Memory Clock Rate (KHz): 3505000
  Max Grid Size: 2147483647 65535 65535
  Max Block Size: 1024 1024 64
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 224.320000

Device Number: 1 reports the same properties (a second GeForce GTX 980).
CUDA Fortran: Enquiring Device Properties
integer :: i, istat, nDevices
type (cudaDeviceProp) :: prop
istat = cudaGetDeviceCount(nDevices)
do i = 0, nDevices-1
istat = cudaGetDeviceProperties(prop, i)
write(6,*) 'Device Number: ', i
write(6,*) 'Device name: ', prop%name
...
end do
Device Number: 0
Device Name: GeForce GTX 1080
Device Revision Number: 6.1
Global Memory Size: 8508145664
Number of Multiprocessors: 20
Concurrent Copy and Execution: Yes
Total Constant Memory: 65536
Total Shared Memory per Block: 49152
Registers per Block: 65536
Warp Size: 32
Maximum Threads per Block: 1024
Maximum Block Dimensions: 1024, 1024, 64
Maximum Grid Dimensions: 2147483647 x 65535 x 65535
Maximum Memory Pitch: 2147483647B
Texture Alignment: 512B
Clock Rate: 1733 MHz
Execution Timeout: No
Integrated Device: No
Can Map Host Memory: Yes
Compute Mode: default
Concurrent Kernels: Yes
ECC Enabled: No
Memory Clock Rate: 5005 MHz
Memory Bus Width: 256 bits
L2 Cache Size: 2097152 bytes
Max Threads Per SMP: 2048
Async Engines: 2
Unified Addressing: Yes
Managed Memory: Yes
PGI Compiler Option: -ta=tesla:cc60
GPU-Properties of different nodes
Properties of the dte nodes (dte001-dte015), each with 2 Tesla K40 GPUs:
NVIDIA Model: Tesla K40 (Kepler)
Graphics Chip: GK110
Comp. Capab.: 3.5
Clock rate [MHz]: 745
Device memory [GB]: 12
Bandwidth [GB/s]: 288
Number of SMXes: 15
CUDA cores per SMX (SP): 192
CUDA cores per SMX (DP): 64
Perf. ratio FP64/FP32: 1:3
• Selection of resources:
-G|--gpus=<type>:<n>
possible values for type:
gtx1080, gtx980, k40, v100, rtx5000
How to Use 2 GPUs simultaneously
• Device can be selected with cudaSetDevice(device_number)
• Prepare two executables: exe0 including cudaSetDevice(0)
exe1 including cudaSetDevice(1)
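A minimal sketch of what such an executable's source might contain (an illustration; the actual exe0/exe1 codes are not shown on the slide):

int main( void ) {
  cudaSetDevice(0);   /* exe1 would call cudaSetDevice(1) instead */
  /* ... all subsequent allocations and kernel launches go to GPU 0 ... */
  return 0;
}

The batch script below then allocates one node with two GPUs, on which exe0 and exe1 can run concurrently.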
#!/bin/bash
#SBATCH -p gpu
#SBATCH -t 1:00
#SBATCH -N 1
#SBATCH --gpus-per-node=2    # selects a node with 2 GPUs
macro code in
~ohaan/cuda_kurs/errchk.ut
Exercise: Large Vectors
• Run saxpy.cu with
N > 1024
int main(void) {
int N = 8024, N_thrpb, N_blks, i;
float *x, *y;
complete code in
~ohaan/cuda_kurs/saxpy_large.cu
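A block can contain at most 1024 threads, so a larger N must be distributed over several blocks. A minimal sketch of the standard pattern (the value of N_thrpb and the kernel body are assumptions; the complete solution is in saxpy_large.cu):

__global__ void saxpy_d( int n, float a, float *x, float *y )
{
  /* global thread index across all blocks of the grid */
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[i] = a*x[i] + y[i];
}

/* in main: enough blocks to cover all N elements */
N_thrpb = 256;
N_blks = (N + N_thrpb - 1)/N_thrpb;
saxpy_d<<<N_blks,N_thrpb>>>(N, 2.0f, x_d, y_d);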
3 dim Grids and Blocks
• can be configured with the CUDA type dim3:
dim3 gdims(gdim_x,gdim_y,gdim_z);
dim3 bdims(bdim_x,bdim_y,bdim_z);
kernel <<<gdims,bdims>>> (...);
• This will launch a total number of
gdim_x*gdim_y*gdim_z*bdim_x*bdim_y*bdim_z
threads on the device
• At most (number of SMXes)*2048 threads will be executing at any
time
Vector Addition with 3-dim Grids and Blocks
dim3 block(5,5);
saxpy_2d_d<<<1,block>>>(n, m, 2.0f, x, y);
cudaDeviceSynchronize(); cudaCheckError();
The device code computes the linear index from the two-dimensional thread coordinates:
index = i*m + j;
[Diagram: the n x m array has n rows indexed by i = 0,…,n-1 and m columns indexed by j = 0,…,m-1; in the device code, j runs over threadIdx.y = 0,…,blockDim.y-1.]
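A minimal sketch of a matching kernel, assuming i = threadIdx.x and j = threadIdx.y as in the diagram (the actual saxpy_2d_d may differ in detail):

__global__ void saxpy_2d_d( int n, int m, float a, float *x, float *y )
{
  int i = threadIdx.x;   /* row index */
  int j = threadIdx.y;   /* column index */
  if (i < n && j < m) {
    int index = i*m + j;              /* linear index into the n x m arrays */
    y[index] = a*x[index] + y[index];
  }
}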
#include <sys/time.h>

double get_el_time(){
  struct timeval et;
  gettimeofday( &et, NULL );
  /* tv_usec counts microseconds, so it is scaled by 1.e-6 */
  return (double)et.tv_sec
       + 1.e-6*(double)et.tv_usec;
}

code for get_el_time in
~ohaan/cuda_kurs/time.ut
Timing of CUDA Code with CUDA Events
• Read internal clock before and after a code segment in host code
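The event-based code itself is not reproduced here; a minimal sketch using the standard CUDA event API (the kernel name and launch configuration are placeholders):

cudaEvent_t start, stop;
float elapsed_ms;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);       /* enqueue start event into the stream */
saxpy_d<<<N_blks,N_thrpb>>>(N, 2.0f, x_d, y_d);
cudaEventRecord(stop);        /* enqueue stop event after the kernel */
cudaEventSynchronize(stop);   /* block host until stop has been reached */

cudaEventElapsedTime(&elapsed_ms, start, stop);  /* elapsed time in ms */
cudaEventDestroy(start);
cudaEventDestroy(stop);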
[Diagram: GPU architecture. Each of the n SMXes contains shared 32-bit registers and shared memory / L1 cache; every thread of the blocks scheduled on an SMX has its own local memory. All SMXes share an L2 cache and the device main memory, which holds global, constant, and texture memory. The host CPU has its own main memory, separate from the device main memory.]
Types of Kernel Variables: Local
• Variables (scalars and arrays) defined in the scope of a kernel are local
__global__ void ker1(int szaloc2,..){
int iloc1, iloc2;
float aloc1[6], *aloc2;
aloc2 = (float *)malloc(sizeof(float)*szaloc2);
...
}
• Each thread has its own set of local variables, which are placed in the
register files of the SMXes, or in global memory, if there are not enough
registers or if the variable is an indexed array
• Number of 32-bit registers per SMX: 2^16 = 65536
• Maximal number of registers per thread: 255
• For the maximal number of 2048 threads per SMX, each thread can use at most
65536/2048 = 32 registers
Types of Kernel Variables: Global
• global variables (scalars and arrays) defined in the scope of the
application, reside in device main memory and are shared by all threads
in the kernels
• If allocated from host by calling cudaMalloc()
they can be accessed from host by cudaMemcpy(...)
• If allocated from host by calling cudaMallocManaged()
they can be accessed from host and from device with the same address
(unified memory)
• accessing the same global variable from a kernel by different threads is
not deterministic, since the order of execution for different blocks of
threads is not prescribed
Accessing Global Variables
• Device memory is accessed by load/store operations for aligned
memory segments of size 32, 64, or 128 Bytes
• If the 32 threads of a warp access 32 int or float variables lying
consecutively in memory, 4 load/store operations of 32 Byte segments
serve all 32 accesses (coalescent access)
• Compare the performance of 2-dim array addition on gtx1080:
[Plots: measured bandwidth in GB/s using different blocksizes]
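A minimal sketch contrasting coalescent and strided access (illustrative kernels, not taken from the course code):

/* coalescent: the 32 threads of a warp read/write 32 consecutive floats,
   which are served by 4 load/store operations of 32-Byte segments */
__global__ void copy_coalesced( int n, float *a, float *b )
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) b[i] = a[i];
}

/* strided: consecutive threads touch addresses a stride apart,
   so a warp's accesses are spread over many separate segments */
__global__ void copy_strided( int n, int stride, float *a, float *b )
{
  int i = (blockIdx.x*blockDim.x + threadIdx.x)*stride;
  if (i < n) b[i] = a[i];
}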
Thread 0 executes first, then Thread 1:
Thread 0: read r1 from a   →  r1 = 0
Thread 0: r2 = r1 + b[0]   →  r2 = b[0]
Thread 0: write r2 to a    →  a = b[0]
Thread 1: read r1 from a   →  r1 = b[0]
Thread 1: r2 = r1 + b[1]   →  r2 = b[0]+b[1]
Thread 1: write r2 to a    →  a = b[0]+b[1]
Atomic Operations
atomicAdd : atomic for all CUDA threads in the current program
executing in the same compute device as the current thread.
atomicAdd_block : atomic for all CUDA threads in the current
program executing in the same thread block as the current thread
atomicAdd_system : atomic for all threads in the current program
including other CPUs and GPUs in the system
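A minimal sketch of the read-modify-write above made safe with atomicAdd (an illustrative kernel, not from the course code):

__global__ void sum_d( int n, float *a, float *b )
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  /* the read-modify-write of *a is executed as one indivisible
     operation, so no concurrent update from another thread is lost */
  if (i < n) atomicAdd( a, b[i] );
}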