Week 6 10

Mahaveer R
210968246
Batch 4
Parallel Programming Lab
04 April 2024.
Week 6 – 16 February 2024.

1) Write a simple MPI program to find out pow (x, rank) for all the processes where 'x' is the integer constant,
and 'rank' is the rank of the process.
#include <stdio.h>
#include <math.h>
#include <mpi.h>
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
int world_size, world_rank;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
// Assuming x is a constant value
int x = 40;
double result = pow(x, world_rank);
printf("Process %d: pow(%d, %d) = %f\n", world_rank, x, world_rank, result);
MPI_Finalize();
return 0;
}
Output:
Process 2: pow(40, 2) = 1600.000000
Process 1: pow(40, 1) = 40.000000
Process 0: pow(40, 0) = 1.000000
Process 4: pow(40, 4) = 2560000.000000
Process 3: pow(40, 3) = 64000.000000
2) Write a program in MPI where even ranked process prints "Hello" and odd ranked process prints "World".
#include <stdio.h>
#include <mpi.h>
int main(int argc, char* argv[]) {
if (world_rank % 2 == 0) {
printf(" EVEN Hello from process %d\n", world_rank);
}
else {
printf("ODD World from process %d\n", world_rank);
}
MPI_Finalize();
return 0;
}
Output:
Odd World from process 13
Even Hello from process 2
3) Write a program in MPI to simulate simple calculator. Perform each operation using different process in
parallel.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
char operations[] = { '+', '-', '*', '/' };
char operation = operations[world_rank % 4];
double operand1 = 10.0, operand2 = 2.0;
double result = 0.0;
switch (operation) {
case '+':
result = operand1 + operand2;
break;
case '-':
result = operand1 - operand2;
break;
case '*':
result = operand1 * operand2;
break;
case '/':
if (operand2 != 0) {
result = operand1 / operand2;
}
else {
fprintf(stderr, "Error: Division by zero.\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
break;
default:
fprintf(stderr, "Error: Unknown operation.\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
// Allocate memory dynamically for gathered_results
double* gathered_results = (double*)malloc(world_size * sizeof(double));
MPI_Gather(&result, 1, MPI_DOUBLE, gathered_results, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (world_rank == 0) {
printf("Results:\n");
for (int i = 0; i < world_size; ++i) {
printf("Process %d: %.2f %c %.2f = %.2f\n", i, operand1, operations[i % 4], operand2, gathered_results[i]);
}
}
// Free dynamically allocated memory
free(gathered_results);
MPI_Finalize();
return 0;
}
Output:
Process 0: 10.00 + 2.00 = 12.00
Process 1: 10.00 - 2.00 = 8.00
Process 2: 10.00 * 2.00 = 20.00
Process 3: 10.00 / 2.00 = 5.00
Process 4: 10.00 + 2.00 = 12.00
Process 5: 10.00 - 2.00 = 8.00
Process 6: 10.00 * 2.00 = 20.00
Process 7: 10.00 / 2.00 = 5.00
Process 8: 10.00 + 2.00 = 12.00
Process 9: 10.00 - 2.00 = 8.00
Process 10: 10.00 * 2.00 = 20.00
Process 11: 10.00 / 2.00 = 5.00
Process 12: 10.00 + 2.00 = 12.00
Process 13: 10.00 - 2.00 = 8.00
4) Write a program in MPI to toggle the character of a given string indexed by the rank of the process.
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#define MAX_STRING_SIZE 100
int main(int argc, char *argv[]) {
int rank, size;
char str[MAX_STRING_SIZE] = "HeLLO";
int str_len;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
str_len = strlen(str);
// Determine the character index to toggle based on process rank
int char_index = rank % str_len;
// Toggle the character
if (rank < str_len) {
if (str[char_index] >= 'a' && str[char_index] <= 'z') {
str[char_index] = str[char_index] - 32; // Convert to uppercase
} else if (str[char_index] >= 'A' && str[char_index] <= 'Z') {
str[char_index] = str[char_index] + 32; // Convert to lowercase
}
}
// Gather all modified strings to process 0
MPI_Gather(rank < str_len ? &str[char_index] : NULL, 1, MPI_CHAR,
str, 1, MPI_CHAR, 0, MPI_COMM_WORLD);
// Print the result in process 0
if (rank == 0) {
printf("Original String: HeLLO\n");
printf("Modified String: %s\n", str);
}
MPI_Finalize();
return 0;
}
Output:
Original String: ThisIsALongerString
Modified String: tHiSiSaLoNgErStRiNg
5) Write a program in MPI to reverse the digits of the following integer array of size 9 with 9 processes.
Initialize the Input array to the following values.
Input array : 18, 523, 301, 1234, 2, 14, 108, 150, 1928 Output array: 81, 325, 103, 4321, 2, 41, 801, 51, 8291
#include <stdio.h>
#include <mpi.h>
#define ARRAY_SIZE 9
int reverseDigits(int num) {
int reversed = 0;
while (num > 0) {
reversed = reversed * 10 + num % 10;
num /= 10;
}
return reversed;
}
int rank, size;
if (size != ARRAY_SIZE) {
if (rank == 0) {
fprintf(stderr, "Please run the program with exactly 9 processes.\n");
}
MPI_Finalize();
return 1;
}
int inputArray[ARRAY_SIZE] = { 18, 523, 301, 1234, 2, 14, 108, 150, 1928 };
int outputArray[ARRAY_SIZE];
int gatheredArray[ARRAY_SIZE];
// Scatter the input array among processes
MPI_Scatter(inputArray, 1, MPI_INT, &outputArray[rank], 1, MPI_INT, 0, MPI_COMM_WORLD);
// Reverse the digits
outputArray[rank] = reverseDigits(outputArray[rank]);
// Gather the reversed array using a separate buffer

MPI_Gather(&outputArray[rank], 1, MPI_INT, gatheredArray, 1, MPI_INT, 0, MPI_COMM_WORLD);
// Print the result
if (rank == 0) {
printf("Input array : ");
for (int i = 0; i < ARRAY_SIZE; i++) {
printf("%d ", inputArray[i]);
}
printf("\nOutput array: ");
for (int i = 0; i < ARRAY_SIZE; i++) {
printf("%d ", gatheredArray[i]);
}
printf("\n");
}
MPI_Finalize();
return 0;
}
Output:
Input array : 18 523 301 1234 2 14 108 150 1928
Output array: 81 325 103 4321 2 41 801 51 8291
6) Write a MPI program to find the prime numbers between 1 and 100 using 2 processes.
#include <stdio.h>
#include <stdbool.h>
#include <mpi.h>
#define RANGE_START 1
#define RANGE_END 100
bool isPrime(int num) {
if (num < 2) {
return false;
}
for (int i = 2; i * i <= num; i++) {
if (num % i == 0) {
return false;
}
}
return true;
}
int rank, size;
if (size != 2) {
if (rank == 0) {
fprintf(stderr, "Please run the program with exactly 2 processes.\n");
}
MPI_Finalize();
return 1;
}
int start, end;
int primesInRange = 0;
if (rank == 0) {
// Process 0 will check for primes in the first half of the range
start = RANGE_START;
end = RANGE_END / 2;
}
else {
// Process 1 will check for primes in the second half of the range
start = RANGE_END / 2 + 1;
end = RANGE_END;
}
// Each process checks for prime numbers in its assigned range
for (int num = start; num <= end; num++) {
if (isPrime(num)) {
primesInRange++;
}
}
// Process 0 receives the count of primes from Process 1 and adds them
if (rank == 0) {
int receivedPrimes;
MPI_Recv(&receivedPrimes, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
primesInRange += receivedPrimes;
printf("Prime numbers between %d and %d: %d\n", RANGE_START, RANGE_END, primesInRange);
}
else {
// Process 1 sends its count of primes to Process 0
MPI_Send(&primesInRange, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}
Output:
Prime numbers between 1 and 100: 25
Week 7 – 23 February 2024.

1) Write a MPI program using synchronous send. The sender process sends a word to the receiver. The second
process receives the word, toggles each letter of the word and sends it back to the first process. Both processes
use synchronous send operations.
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <mpi.h>
void toggle_case(char* word) {
for (int i = 0; word[i] != '\0'; i++) {
if (isupper(word[i]))
word[i] = tolower(word[i]);
else if (islower(word[i]))
word[i] = toupper(word[i]);
}
}
int rank;
const int tag = 0;
const int root = 0;
if (rank == root) {
// Sender process
const char* word = "Hello";
printf("Process %d sends word: %s\n", rank, word);
MPI_Ssend(word, strlen(word) + 1, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
// Receive the toggled word back from process 1
char received_word[100];
MPI_Recv(received_word, 100, MPI_CHAR, 1, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Process %d received toggled word: %s\n", rank, received_word);
} else if (rank == 1) {
// Receiver process
char word[100];
MPI_Recv(word, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Process %d received word: %s\n", rank, word);
// Toggle the case of each letter in the word
toggle_case(word);
// Send it back to the sender process

MPI_Ssend(word, strlen(word) + 1, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}
Output:
Process 1 received word: Hello
Process 0 sends word: Hello
Process 0 received toggled word: hELLO
2) Write a MPI program where the master process (process 0) sends a number to each of the slaves and the
slave processes receive the number and prints it. Use standard send.
#include <mpi.h>
#include <stdio.h>
int main(int argc, char** argv) {
int world_size;
int world_rank;
int number;
// Master process
// Choose a number to send to all slave processes
number = 777; // You can change this number to any number you want to send
// Use MPI_Send to send it to all the other processes
for (int i = 1; i < world_size; i++) {
MPI_Send(&number, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
printf("Master process sending number %d to process %d\n", number, i);
}
} else {
// Slave processes
MPI_Recv(&number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Slave process %d received number %d from master process\n", world_rank, number);
}
MPI_Finalize();
return 0;
}
Output:
Slave process 3 received number 123 from master process
Master process sending number 123 to process 1
3) Write a MPI program to read N elements of the array in the root process (process 0) where N is equal to the total
number of process. The root process sends one value to each of the slaves. Let even ranked process finds square of the
received element and odd ranked process finds cube of received element. Use Buffered send.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
int world_size;
int world_rank;
// The root process (process 0) will read N elements where N is equal to world_size
int *numbers = NULL;
numbers = (int*)malloc(sizeof(int) * world_size);
// Initialize the array with some values
numbers[i] = i + 1; // Or any other logic to initialize the array
}
}
// Buffer for buffered send
int buffer_size = world_size * sizeof(int) + MPI_BSEND_OVERHEAD;
void *buffer = malloc(buffer_size);
MPI_Buffer_attach(buffer, buffer_size);
// Distribute one number to each slave process
MPI_Bsend(&numbers[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
}
}
// Each slave process receives a number and performs its operation
int received_number;
if (world_rank != 0) {
MPI_Recv(&received_number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
if (world_rank % 2 == 0) {
// Even rank: square the number
printf("Process %d received %d, squaring it to %d\n", world_rank, received_number, received_number *
received_number);
} else {
// Odd rank: cube the number
printf("Process %d received %d, cubing it to %d\n", world_rank, received_number, received_number *
received_number * received_number);
}
}
// Detach and free the buffer
MPI_Buffer_detach(&buffer, &buffer_size);
free(buffer);
// Free numbers array on root process
if(world_rank == 0) {
free(numbers);
}
MPI_Finalize();
return 0;
}
Output:
Process 1 received 2, cubing it to 8
Process 3 received 4, cubing it to 64
Process 2 received 3, squaring it to 9
4) Write a MPI program to read an integer value in the root process. Root process sends this value to Process1,
Process1 sends this value to Process2 and so on. Last process sends the value back to root process. When
sending the value each process will first increment the received value by one. Write the program using point to
point communication routines.
#include <stdio.h>
#include <mpi.h>
int value;
int world_rank, world_size;
// Root process reads the integer value
value = 10; // Example value, can be read from user input
printf("Root process starts with value: %d\n", value);
MPI_Send(&value, 1, MPI_INT, world_rank + 1, 0, MPI_COMM_WORLD);
} else {
MPI_Recv(&value, 1, MPI_INT, world_rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
value++; // Increment the value by one
printf("Process %d incremented value to: %d\n", world_rank, value);
if (world_rank < world_size - 1) {
MPI_Send(&value, 1, MPI_INT, world_rank + 1, 0, MPI_COMM_WORLD);
} else {
// Last process sends it back to the root process
MPI_Send(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
}
// Root process receives the value from the last process
MPI_Recv(&value, 1, MPI_INT, world_size - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Root process received final value: %d\n", value);
}
MPI_Finalize();
return 0;
}
Output:
Process 1 incremented value to: 11
Root process starts with value: 10
Root process received final value: 13
5) Write a MPI program to read N elements of an array in the master process. Let N processes including master
process check the array values are prime or not.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
// Function to check if a number is prime
int is_prime(int number) {
if (number <= 1) return 0;
for (int i = 2; i * i <= number; i++) {
if (number % i == 0) return 0;
}
return 1;
}
int world_rank, world_size, number_to_check;
int* array = NULL; // Initialize the pointer to NULL
array = (int*)malloc(sizeof(int) * world_size); // Allocate memory only on the root process
array[i] = i + 2; // Example values, start from 2 (first prime number)
}
}
// Use MPI_Scatter to distribute the values
MPI_Scatter(array, 1, MPI_INT, &number_to_check, 1, MPI_INT, 0, MPI_COMM_WORLD);
// Each process checks if the number received is prime
int result = is_prime(number_to_check);
printf("Process %d received %d, prime? %s\n", world_rank, number_to_check, result ? "Yes" : "No");
// Free the allocated memory on the root process
free(array);
}
MPI_Finalize();
return 0;
}
Output:
Process 1 received 3, prime? Yes
Process 2 received 4, prime? No
Week 8 – 1 March 2024.

1. Write a program in CUDA to add two vectors of length N using
a) block size as N b) N threads
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
// CUDA kernel to add elements of two arrays
__global__ void add(int* x, int* y, int* z, int n) {
int index = threadIdx.x;
if (index < n)
z[index] = x[index] + y[index];
}
// Function to print the vector
void printVector(int* vector, int size) {
for (int i = 0; i < size; i++) {
printf("%d ", vector[i]);
}
printf("\n");
}
int main() {
int N = 1024; // Example size, make sure it does not exceed your GPU's capability
int* x, * y, * z, * d_x, * d_y, * d_z;
// Allocate memory
x = (int*)malloc(N * sizeof(int));
y = (int*)malloc(N * sizeof(int));
z = (int*)malloc(N * sizeof(int));
cudaMalloc(&d_x, N * sizeof(int));
cudaMalloc(&d_y, N * sizeof(int));
cudaMalloc(&d_z, N * sizeof(int));
// Initialize arrays
for (int i = 0; i < N; i++) {
x[i] = 5;
y[i] = 2;
}
// Copy inputs to device
cudaMemcpy(d_x, x, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);
// Launch add() kernel on GPU
add << <1, N >> > (d_x, d_y, d_z, N);
// Copy result back to host
cudaMemcpy(z, d_z, N * sizeof(int), cudaMemcpyDeviceToHost);
// Print the result
printf("Result: ");
printVector(z, N);
// Cleanup
cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
free(x); free(y); free(z);
return 0;
}
Output:
Result: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777
#include <stdio.h>
#include <stdio.h>
#include <cstdlib>
__global__ void add(int* x, int* y, int* z, int n) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < n)
z[index] = x[index] + y[index];
}
void printVector(int* vector, int size) {
printf("%d ", vector[i]);
}
printf("\n");
}
int main() {
int N = 2048; // Example size
int* x, * y, * z, * d_x, * d_y, * d_z;
x = (int*)malloc(N * sizeof(int));
y = (int*)malloc(N * sizeof(int));
z = (int*)malloc(N * sizeof(int));
cudaMalloc(&d_x, N * sizeof(int));
cudaMalloc(&d_y, N * sizeof(int));
cudaMalloc(&d_z, N * sizeof(int));
for (int i = 0; i < N; i++) {
x[i] = 5;
y[i] = 2;
}
cudaMemcpy(d_x, x, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);
int blockSize = 256; // Choose appropriate block size
int numBlocks = (N + blockSize - 1) / blockSize;
add << <numBlocks, blockSize >> > (d_x, d_y, d_z, N);
cudaMemcpy(z, d_z, N * sizeof(int), cudaMemcpyDeviceToHost);
printf("Result: ");
printVector(z, N);
cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
free(x); free(y); free(z);
return 0;
}
Output:
Results: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777777777777777777777777777777777777777777777777777777777777
777777
2) Implement a CUDA program to add two vectors of length N by keeping the number of threads per block as
256 (constant) and vary the number of blocks to handle N elements.
#include <stdio.h>
#include <stdio.h>
#include <cstdlib>
// CUDA kernel to add elements of two arrays
__global__ void add(int* a, int* b, int* c, int N) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < N)
c[index] = a[index] + b[index];
}
int main() {
int N = 1 << 20; // Example: Number of elements in each vector (1 Million elements)
int size = N * sizeof(int);
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; // Calculate needed blocks
// Allocate memory on the host
int* h_a, * h_b, * h_c;
h_a = (int*)malloc(size);
h_b = (int*)malloc(size);
h_c = (int*)malloc(size);
// Initialize vectors on the host
for (int i = 0; i < N; i++) {
h_a[i] = i;
h_b[i] = i;
}
// Allocate vectors in device memory
int* d_a, * d_b, * d_c;
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
// Copy vectors from host memory to device memory
cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
// Invoke kernel
add << <blocksPerGrid, threadsPerBlock >> > (d_a, d_b, d_c, N);
// Copy result from device memory to host memory
cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
// Print the results: Print the first 10 elements
printf("Result of Vector Addition (First 10 Elements):\n");
for (int i = 0; i < 10; i++) {
printf("h_c[%d] = %d\n", i, h_c[i]);
}
// Free device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
// Free host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
Output:
Result of Vector Addition (First 10 Elements):
h_c[0] = 0
h_c[1] = 2
h_c[2] = 4
h_c[3] = 6
h_c[4] = 8
h_c[5] = 10
h_c[6] = 12
h_c[7] = 14
h_c[8] = 16
h_c[9] = 18
3) Write a program in CUDA which performs convolution operation on one dimensional input array N of size width using
a mask array M of size mask_width to produce the resultant one-dimensional array P of size width.
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cstdlib>
// CUDA Kernel for one-dimensional convolution
__global__ void convolution_1d(int* N, int* M, int* P, int width, int mask_width) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int k;
// Each thread computes one element of P
if (i < width) {
int pValue = 0;
// Convolution operation
for (k = 0; k < mask_width; ++k) {
int maskIndex = mask_width - 1 - k;
int nIndex = i - (mask_width / 2) + k;
if (nIndex >= 0 && nIndex < width) {
pValue += N[nIndex] * M[maskIndex];
}
}
P[i] = pValue;
}
}
// Function to print the array
void printArray(int* array, int size) {
printf("%d ", array[i]);
}
printf("\n");
int main() {
int width = 10; // Example array size
int mask_width = 3; // Example mask size
int N[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; // Example input array
int M[3] = { 1, 0, -1 }; // Example mask array
int P[10]; // Resultant array
int* d_N, * d_M, * d_P;
// Allocate memory on the device
cudaMalloc(&d_N, width * sizeof(int));
cudaMalloc(&d_M, mask_width * sizeof(int));
cudaMalloc(&d_P, width * sizeof(int));
cudaMemcpy(d_N, N, width * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_M, M, mask_width * sizeof(int), cudaMemcpyHostToDevice);
// Launch kernel on the GPU
convolution_1d << <(width + 255) / 256, 256 >> > (d_N, d_M, d_P, width, mask_width);
cudaMemcpy(P, d_P, width * sizeof(int), cudaMemcpyDeviceToHost);
// Print the resultant array
printf("Resultant array: ");
printArray(P, width);
cudaFree(d_N);
cudaFree(d_M);
cudaFree(d_P);
return 0;
}
Output:
Resultant array: 2 2 2 2 2 2 2 2 2 -9
4) Write a program in CUDA to process a ID array containing angles in radians to generate sine of the angles in
the output array. Use appropriate function.
#include <stdio.h>
#include <math.h>
#include <stdio.h>
#include <cstdlib>
// Define M_PI if it's not defined by the math library
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// CUDA Kernel to compute sine of angles in radians
__global__ void compute_sine(float* input, float* output, int n) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < n) {
output[index] = sinf(input[index]); // Use sinf for single precision float
}
}
// Function to print the array
void printArray(float* array, int size) {
printf("%f ", array[i]);
}
printf("\n");
}
int main() {
int n = 5; // Example array size
float inputAngles[] = { 0.0, M_PI / 6, M_PI / 4, M_PI / 2, M_PI }; // Example angles in radians
// Allocate memory on the device
float* d_input, * d_output;
cudaMalloc((void**)&d_input, n * sizeof(float));
cudaMalloc((void**)&d_output, n * sizeof(float));
// Copy the input array from host to device
cudaMemcpy(d_input, inputAngles, n * sizeof(float), cudaMemcpyHostToDevice);
// Calculate the number of blocks and threads per block
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
// Allocate memory for the output array on the host
float* output = new float[n]; // Use dynamic allocation
// Launch the CUDA Kernel
compute_sine << <blocksPerGrid, threadsPerBlock >> > (d_input, d_output, n);
// Copy the result back to the host
cudaMemcpy(output, d_output, n * sizeof(float), cudaMemcpyDeviceToHost);
// Print the resultant array
printf("Sine of angles: ");
printArray(output, n);
cudaFree(d_input);
cudaFree(d_output);
// Free host memory
delete[] output;
return 0;
}
Output:
Sine of angles: 0.000000 0.500000 0.707107 1.000000 -0.000000
Week 9 – 8 March 2024.

1) Write a program in CUDA to count the number of times a given word is repeated in a sentence (Use Atomic
Function).
#include <stdio.h>
#include <stdio.h>
#include <cstdlib>
#include <string.h>
#include <sm_20_atomic_functions.h>
__global__ void countWordOccurrences(char* sentence, char* word, int sentenceLength, int wordLength, int* count)
{
int stride = blockDim.x * gridDim.x;
int localCount = 0;
// Each thread checks a part of the sentence for the word
for (int i = index; i <= sentenceLength - wordLength; i += stride) {
bool wordFound = true;
for (int j = 0; j < wordLength; j++) {
if (sentence[i + j] != word[j]) {
wordFound = false;
break;
}
}
if (wordFound) {
localCount++;
}
}
// Use atomicAdd to safely add the local count to the global total
atomicAdd(count, localCount);
}
int main() {
char* sentence = "hello hello world world world world"; // Example sentence
char* word = "world"; // Word to count in the sentence
int* count;
char* d_sentence, * d_word;
int* d_count;
int sentenceLength = strlen(sentence);
int wordLength = strlen(word);
// Allocate memory for device copies of sentence, word, count
cudaMalloc((void**)&d_sentence, sentenceLength);
cudaMalloc((void**)&d_word, wordLength);
cudaMalloc((void**)&d_count, sizeof(int));
cudaMemcpy(d_sentence, sentence, sentenceLength, cudaMemcpyHostToDevice);
cudaMemcpy(d_word, word, wordLength, cudaMemcpyHostToDevice);
cudaMemset(d_count, 0, sizeof(int));
// Launch countWordOccurrences() kernel on GPU with enough blocks and threads
countWordOccurrences << <1, 256 >> > (d_sentence, d_word, sentenceLength, wordLength, d_count);
cudaMemcpy(count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
printf("The word '%s' appears %d times in the sentence.\n", word, *count);
// Cleanup
cudaFree(d_sentence); cudaFree(d_word); cudaFree(d_count);
return 0;
}
Output:
The word ‘world’ appears 4 times in the sentence.
2) Write a CUDA program that reads a string S and produces the string RS as follows:
Input string S: PCAP Output string RS: PCAPPCAPCP Note: Each work item copies required number of
characters from S in RS.
#include <stdio.h>
__global__ void generateString(char* inputString, char* outputString, int length) {

for (int i = index; i < length; i += stride) {
outputString[i] = inputString[i % 4]; // Modulo operation to repeat the string
}
}
int main() {
char inputString[] = "PCAP"; // Example input string
int length = 12; // Length of the output string (4 times the length of inputString)
char* d_inputString, * d_outputString;
// Allocate memory for device copies of inputString and outputString
cudaMalloc((void**)&d_inputString, strlen(inputString));
cudaMalloc((void**)&d_outputString, length);
// Copy inputString to device
cudaMemcpy(d_inputString, inputString, strlen(inputString), cudaMemcpyHostToDevice);
// Launch generateString kernel on GPU with enough blocks and threads
generateString << <1, 256 >> > (d_inputString, d_outputString, length);
char* outputString = (char*)malloc(length);
cudaMemcpy(outputString, d_outputString, length, cudaMemcpyDeviceToHost);
printf("Input string S: %s\n", inputString);
printf("Output string RS: %s\n", outputString);
// Cleanup
cudaFree(d_inputString);
cudaFree(d_outputString);
free(outputString);
return 0;
}
Output:
Input string S: PCAP
Output string RS: PCAPPCAPPCAP
3) Write a CUDA program which reads a string consisting of N words and reverse each word of it in parallel.
#include <stdio.h>
#include <stdlib.h>
__global__ void reverseWords(char* inputString, char* outputString, int* wordLengths, int numWords) {
// Iterate over each word
for (int i = index; i < numWords; i += stride) {
int wordStart = (i == 0) ? 0 : wordLengths[i - 1]; // Start index of the word
int wordEnd = wordLengths[i]; // End index of the word
// Reverse the word
for (int j = wordStart; j < (wordStart + wordEnd) / 2; j++) {
char temp = inputString[j];
outputString[j] = inputString[wordStart + wordEnd - j - 1];
outputString[wordStart + wordEnd - j - 1] = temp;
}
}
}
int main() {
char inputString[] = "Hello World CUDA Program"; // Example input string
int length = strlen(inputString);
int numWords = 0;
// Count the number of words
for (int i = 0; i < length; i++) {
if (inputString[i] == ' ') {
numWords++;
}
}
numWords++; // Increment for the last word
// Allocate memory for device copies of inputString, outputString, and wordLengths
int* d_wordLengths;
cudaMalloc((void**)&d_inputString, length);
cudaMalloc((void**)&d_outputString, length);
cudaMalloc((void**)&d_wordLengths, numWords * sizeof(int));
cudaMemcpy(d_inputString, inputString, length, cudaMemcpyHostToDevice);
// Calculate word lengths
int wordStartIndex = 0;
int* wordLengths = (int*)malloc(numWords * sizeof(int));
for (int i = 0; i < length; i++) {
if (inputString[i] == ' ') {
wordLengths[numWords - 1] = i - wordStartIndex; // Store length of each word
wordStartIndex = i + 1; // Move to the start of next word
numWords--;
}
}
wordLegths[numWords - 1] = length - wordStartIndex; // Length of the last word
// Copy wordLengths to device
cudaMemcpy(d_wordLengths, wordLengths, numWords * sizeof(int), cudaMemcpyHostToDevice);
// Launch reverseWords kernel on GPU with enough blocks and threads
int blockSize = 256;
int numBlocks = (length + blockSize - 1) / blockSize;
reverseWords << <numBlocks, blockSize >> > (d_inputString, d_outputString, d_wordLengths, numWords);
char* outputString = (char*)malloc(length);
cudaMemcpy(outputString, d_outputString, length, cudaMemcpyDeviceToHost);
printf("Input string: %s\n", inputString);
printf("Output string: %s\n", outputString);
// Cleanup
cudaFree(d_wordLengths);
free(wordLengths);
free(outputString);
return 0;
}
Output:
Input string: Hello World
Output string: olleH dlroW
4) Write a CUDA program that takes a string Sin as input and one integer value N and produces an output
string, Sout, in parallel by concatenating input string Sin, N times as shown below.
INPUT : Sin ="Hello" N =3 OUTPUT : Sout = "HelloHelloHello" Note: Every thread copies the same
character from the Input string S, N times to the required position.
#include <stdio.h>
__global__ void concatenateString(char* inputString, int inputLength, char* outputString, int repetitions) {
// Copy characters from input string to output string
for (int i = index; i < inputLength * repetitions; i += stride) {
outputString[i] = inputString[i % inputLength]; // Copy characters from input string repeatedly
}
}
int main() {
char inputString[] = "Hello"; // Example input string
int repetitions = 3; // Number of times input string should be repeated
int outputLength = length * repetitions; // Length of the output string
cudaMalloc((void**)&d_outputString, outputLength);
// Launch concatenateString kernel on GPU with enough blocks and threads
int numBlocks = (outputLength + blockSize - 1) / blockSize;
concatenateString <<<numBlocks, blockSize>>> (d_inputString, length, d_outputString, repetitions);
char* outputString = (char*)malloc(outputLength);
cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);
// Cleanup
free(outputString);
return 0;
}
Output:
Input string: Hello
Output string: HelloHelloHello
5) Write a CUDA program which reads a string Sin and produces an output string T as shownbelow.
Input: Sin: "Hai" Ouput: T: "Haaiii" Note:Every thread stores a character from input string Sin, required
number of times intooutput string T.
#include <stdio.h>
__global__ void repeatCharacters(char* inputString, int inputLength, char* outputString, int repetitions) {
// Copy characters from input string to output string
for (int i = index; i < inputLength * repetitions; i += stride) {
outputString[i] = inputString[i % inputLength]; // Copy characters from input string repeatedly
}
// Repeat the last character multiple times
if (index == 0) {
char lastChar = inputString[inputLength - 1];
for (int i = inputLength * repetitions - 1; i >= inputLength * (repetitions - 1); --i) {
outputString[i] = lastChar;
}
}
}
int main() {
char inputString[] = "Hai"; // Example input string
int repetitions = 3; // Number of times each character should be repeated, except for the last character
int outputLength = length * repetitions + 2; // Length of the output string
cudaMalloc((void**)&d_outputString, outputLength);
// Launch repeatCharacters kernel on GPU with enough blocks and threads
int numBlocks = (outputLength + blockSize - 1) / blockSize;
repeatCharacters<<<numBlocks, blockSize>>>(d_inputString, length, d_outputString, repetitions);
char* outputString = (char*)malloc(outputLength);
cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);
// Cleanup
free(outputString);
return 0;
}
Output:
Input string: Hello
Output string: Heelllllllooooo
Week 10 – 26 March 2024.

1) Write a program in CUDA to add two matrices for the following specifications:
• Each row of resultant matrix to be computed by one thread.
• Each column of resultant matrix to be computed by one thread.
• Each element of resultant matrix to be computed by one thread.
#include <stdio.h>
#define N 3 // Matrix size
// Kernel to add two matrices where each row of the resultant matrix is computed by one thread
__global__ void addMatrixRows(float* A, float* B, float* C) {
int row = blockIdx.x; // Each block computes one row
int col = threadIdx.x; // Each thread computes one element of the row
C[row * N + col] = A[row * N + col] + B[row * N + col];
}
// Kernel to add two matrices where each column of the resultant matrix is computed by one thread
__global__ void addMatrixColumns(float* A, float* B, float* C) {
int row = threadIdx.x; // Each thread computes one element of the column
int col = blockIdx.x; // Each block computes one column
C[row * N + col] = A[row * N + col] + B[row * N + col];
}
// Kernel to add two matrices where each element of the resultant matrix is computed by one thread
__global__ void addMatrixElements(float* A, float* B, float* C) {
int index = threadIdx.x + blockIdx.x * blockDim.x; // Each thread computes one element of the matrix
if (index < N * N) {
C[index] = A[index] + B[index];
}
}
// Helper function to initialize matrices with random values
void initializeMatrix(float* matrix) {
for (int i = 0; i < N * N; i++) {
matrix[i] = (float)rand() / RAND_MAX; // Random value between 0 and 1
}
}
// Helper function to print matrix
void printMatrix(float* matrix) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
printf("%.2f\t", matrix[i * N + j]);
}
printf("\n");
}
}
int main() {
// Allocate memory for host matrices
float *h_A, *h_B, *h_C;
h_A = (float*)malloc(N * N * sizeof(float));
h_B = (float*)malloc(N * N * sizeof(float));
h_C = (float*)malloc(N * N * sizeof(float));
// Initialize host matrices with random values
initializeMatrix(h_A);
initializeMatrix(h_B);
// Allocate memory for device matrices
float *d_A, *d_B, *d_C;
cudaMalloc((void**)&d_A, N * N * sizeof(float));
cudaMalloc((void**)&d_B, N * N * sizeof(float));
cudaMalloc((void**)&d_C, N * N * sizeof(float));
// Copy host matrices to device
cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, N * N * sizeof(float), cudaMemcpyHostToDevice);
// Launch kernel to add matrices by rows
addMatrixRows<<<N, N>>>(d_A, d_B, d_C);
// Copy result matrix from device to host
cudaMemcpy(h_C, d_C, N * N * sizeof(float), cudaMemcpyDeviceToHost);
// Print result matrix
printf("Matrix C (Sum of A and B by rows):\n");
printMatrix(h_C);
// Launch kernel to add matrices by columns
addMatrixColumns<<<N, N>>>(d_A, d_B, d_C);
printf("\nMatrix C (Sum of A and B by columns):\n");
printMatrix(h_C);
// Launch kernel to add matrices element-wise
addMatrixElements<<<(N * N + 255) / 256, 256>>>(d_A, d_B, d_C);
printf("\nMatrix C (Sum of A and B element-wise):\n");
printMatrix(h_C);
// Free device and host memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
return 0;
}
Output:
Matrix A:
1.00 2.00 3.00
4.00 5.00 6.00
7.00 8.00 9.00
Matrix B:
0.34 -0.67 -2.38
-3.33 -3.66 -4.60
-6.08 -7.32 -7.47
Matrix C (Sum of A and B by rows):

1.34 1.33 0.62
0.67 1.34 1.40
0.92 1.68 1.53
Matrix C (Sum of A and B by columns):

1.34 1.33 0.62
0.67 1.34 1.40
0.92 1.68 1.53
Matrix C (Sum of A and B element-wise):

1.34 1.33 0.62
0.67 1.34 1.40
0.92 1.68 1.53
2) Write a program in CUDA to multiply two matrices for the following specifications:
• Each row of resultant matrix to be computed by one thread.
• Each column of resultant matrix to be computed by one thread.
• Each element of resultant matrix to be computed by one thread.
#include <stdio.h>
// Kernel function to multiply two matrices
__global__ void matrixMultiplication(int *a, int *b, int *c, int width) {
int row = blockIdx.y * blockDim.y + threadIdx.y; // Row index
int col = blockIdx.x * blockDim.x + threadIdx.x; // Column index
// Check if within matrix bounds
if (row < width && col < width) {
int sum = 0;
for (int i = 0; i < width; i++) {
sum += a[row * width + i] * b[i * width + col];
}
c[row * width + col] = sum;
}
}
int main() {
int width = 4; // Width of matrices
int size = width * width * sizeof(int); // Size of matrices in bytes
// Host matrices and result matrix
int *h_a, *h_b, *h_c;
h_a = (int*)malloc(size);
h_b = (int*)malloc(size);
h_c = (int*)malloc(size);
// Initialize host matrices with sample data
for (int i = 0; i < width * width; i++) {
h_a[i] = i;
h_b[i] = i * 2;
}
// Device matrices
int *d_a, *d_b, *d_c;
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
// Copy host matrices to device
cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
// Define kernel launch configuration
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(width / threadsPerBlock.x, width / threadsPerBlock.y);
// Launch kernel
matrixMultiplication<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, width);
cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
printf("Result Matrix:\n");
for (int i = 0; i < width; i++) {
for (int j = 0; j < width; j++) {
printf("%d ", h_c[i * width + j]);
}
printf("\n");
}
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
// Free host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
Output:
Matrix A:
0123
4567
8 9 10 11
12 13 14 15
Matrix B:
0246
8 10 12 14
16 18 20 22
24 26 28 30
Result Matrix:
28 34 40 46
76 98 120 142
124 162 200 238
172 226 280 334
3) Write a CUDA program to perform linear algebra function of the form y=(alpha)x+y, where x and y are
vectors and "alpha" ia scalar value.
#include <stdio.h>
__global__ void vectorAdd(float *x, float *y, float alpha, int N) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < N) {
y[idx] = alpha * x[idx] + y[idx];
}
}
int main() {
const int N = 5;
float x[N] = {1, 2, 3, 4, 5};
float y[N] = {6, 7, 8, 9, 10};
float alpha = 2;
float *d_x, *d_y;
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
int numBlocks = (N + blockSize - 1) / blockSize;
vectorAdd<<<numBlocks, blockSize>>>(d_x, d_y, alpha, N);
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);
printf("Resulting vector y:\n");
for (int i = 0; i < N; ++i) {
printf("%f ", y[i]);
}
printf("\n");
return 0;
}
Output:
Input vector x:
12345
Input vector y:
6 7 8 9 10
Scalar alpha: 2
Resulting vector y:
8. 11 14 17 20
4) Write a CUDA program to sort every row of a matrix using selection sort.
#include <stdio.h>
__device__ void selectionSort(int *row, int size) {
for (int i = 0; i < size - 1; ++i) {
int minIndex = i;
for (int j = i + 1; j < size; ++j) {
if (row[j] < row[minIndex]) {
minIndex = j;
}
}
int temp = row[i];
row[i] = row[minIndex];
row[minIndex] = temp;
}
}
__global__ void sortRows(int *matrix, int rows, int cols) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row < rows) {
int *rowPtr = matrix + row * cols;
selectionSort(rowPtr, cols);
}
}
int main() {
int rows = 3;
int cols = 4;
int matrix[3][4] = {{9, 7, 5, 8},
{4, 6, 2, 1},
{3, 0, 2, 5}};
int *d_matrix;
cudaMalloc((void**)&d_matrix, rows * cols * sizeof(int));
cudaMemcpy(d_matrix, matrix, rows * cols * sizeof(int), cudaMemcpyHostToDevice);
int blockSize = 4; // Number of threads per block
int numBlocks = (rows + blockSize - 1) / blockSize;
sortRows<<<numBlocks, blockSize>>>(d_matrix, rows, cols);
cudaMemcpy(matrix, d_matrix, rows * cols * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_matrix);
printf("Sorted Matrix:\n");
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
printf("%d ", matrix[i][j]);
}
printf("\n");
}
return 0;
}
Output:
Input matrix:
97351
64820
59247
Sorted matrix (each row):

13579
02468
24579
5) Write a CUDA program to perform odd even transposition sort in parallel.

#include <stdio.h>
__global__ void oddEvenSort(int *arr, int n) {
int tid = threadIdx.x;
int offset = blockDim.x;
for (int i = 0; i < n; ++i) {
int partner_tid = tid ^ 1;
int temp;
if ((i + tid) % 2 == 0 && partner_tid < n) {
if (arr[tid] > arr[partner_tid]) {
temp = arr[tid];
arr[tid] = arr[partner_tid];
arr[partner_tid] = temp; } }
__syncthreads(); }
}
int main() {
int n = 10, arr[] = {9, 4, 6, 2, 8, 5, 7, 1, 3, 0};
int *d_arr;
cudaMalloc((void**)&d_arr, n * sizeof(int));
cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);
int blockSize = 10; // Number of threads per block
oddEvenSort<<<1, blockSize>>>(d_arr, n);
cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_arr);
printf("Sorted Array:\n");
for (int i = 0; i < n; ++i)
printf("%d ", arr[i]);
return 0;
}
Output:
Sorted Array:
0123456789

Week 6 10

Uploaded by

Copyright:

Available Formats

Week 6 10

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Week 6 10

Uploaded by

Copyright:

Available Formats

Mahaveer R

Week 6 – 16 February 2024.

// Gather the reversed array using a separate buffer

Week 7 – 23 February 2024.

// Send it back to the sender process

Week 8 – 1 March 2024.

Week 9 – 8 March 2024.

global void generateString(char* inputString, char* outputString, int length) {

Week 10 – 26 March 2024.

Matrix C (Sum of A and B by rows):

Matrix C (Sum of A and B by columns):

Matrix C (Sum of A and B element-wise):

Sorted matrix (each row):

5) Write a CUDA program to perform odd even transposition sort in parallel.

You might also like