Week 6 10
Week 6 10
Week 6 10
Batch 4
Parallel Programming Lab
04 April 2024.
Process 2: pow(40, 2) = 1600.000000
Process 1: pow(40, 1) = 40.000000
Process 0: pow(40, 0) = 1.000000
Process 4: pow(40, 4) = 2560000.000000
Process 3: pow(40, 3) = 64000.000000
2) Write a program in MPI where even ranked process prints "Hello" and odd ranked process prints "World".
#include <stdio.h>
#include <mpi.h>
// Even-ranked processes print a "Hello" line and odd-ranked processes print
// a "World" line; every line carries the rank of the printing process.
int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    if (world_rank % 2 == 0) {
        printf(" EVEN Hello from process %d\n", world_rank);
    } else {
        printf("ODD World from process %d\n", world_rank);
    }

    // MPI_Init must be paired with MPI_Finalize before the process exits.
    MPI_Finalize();
    return 0;
}
Odd World from process 13
Even Hello from process 2
Odd World from process 9
Odd World from process 1
Even Hello from process 8
Even Hello from process 12
Odd World from process 5
Odd World from process 3
Odd World from process 11
Even Hello from process 6
Even Hello from process 0
Even Hello from process 4
Odd World from process 7
Even Hello from process 10
3) Write a program in MPI to simulate a simple calculator. Perform each operation using a different process in parallel.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
// Simple parallel calculator: rank r applies operations[r % 4] to the two
// fixed operands, then rank 0 gathers every result and prints one line per
// rank.
int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    char operations[] = { '+', '-', '*', '/' };
    char operation = operations[world_rank % 4];
    double operand1 = 10.0, operand2 = 2.0;
    double result = 0.0;

    // Each case ends in break; without it every case would fall through
    // and all ranks would end up with the division result.
    switch (operation) {
    case '+':
        result = operand1 + operand2;
        break;
    case '-':
        result = operand1 - operand2;
        break;
    case '*':
        result = operand1 * operand2;
        break;
    case '/':
        if (operand2 != 0) {
            result = operand1 / operand2;
        } else {
            fprintf(stderr, "Error: Division by zero.\n");
        }
        break;
    default:
        fprintf(stderr, "Error: Unknown operation.\n");
        break;
    }

    // Only the root needs a receive buffer; the recvbuf argument of
    // MPI_Gather is significant only at the root.
    double* gathered_results = NULL;
    if (world_rank == 0) {
        gathered_results = (double*)malloc(world_size * sizeof(double));
        if (gathered_results == NULL) {
            fprintf(stderr, "Error: Out of memory.\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }

    MPI_Gather(&result, 1, MPI_DOUBLE, gathered_results, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (world_rank == 0) {
        for (int i = 0; i < world_size; ++i) {
            printf("Process %d: %.2f %c %.2f = %.2f\n", i, operand1, operations[i % 4], operand2, gathered_results[i]);
        }
        // Free dynamically allocated memory
        free(gathered_results);
    }

    MPI_Finalize();
    return 0;
}
Process 0: 10.00 + 2.00 = 12.00
Process 1: 10.00 - 2.00 = 8.00
Process 2: 10.00 * 2.00 = 20.00
Process 3: 10.00 / 2.00 = 5.00
Process 4: 10.00 + 2.00 = 12.00
Process 5: 10.00 - 2.00 = 8.00
Process 6: 10.00 * 2.00 = 20.00
Process 7: 10.00 / 2.00 = 5.00
Process 8: 10.00 + 2.00 = 12.00
Process 9: 10.00 - 2.00 = 8.00
Process 10: 10.00 * 2.00 = 20.00
Process 11: 10.00 / 2.00 = 5.00
Process 12: 10.00 + 2.00 = 12.00
Process 13: 10.00 - 2.00 = 8.00
4) Write a program in MPI to toggle the character of a given string indexed by the rank of the process.
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#define MAX_STRING_SIZE 100
// Toggles the case of a string one character per process: rank r toggles
// character r, rank 0 gathers the toggled characters and prints the
// original and the modified string.
//
// Ranks beyond the end of the string still take part in the gather (it is a
// collective call) but contribute a placeholder that rank 0 discards.
int main(int argc, char *argv[]) {
    int rank, size;
    char str[MAX_STRING_SIZE] = "HeLLO";
    char original[MAX_STRING_SIZE];
    char gathered[MAX_STRING_SIZE];
    int str_len;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // The gather buffer holds one char per process, so the process count
    // must fit in it.
    if (size > MAX_STRING_SIZE) {
        if (rank == 0) {
            fprintf(stderr, "Please run the program with at most %d processes.\n", MAX_STRING_SIZE);
        }
        MPI_Finalize();
        return 1;
    }

    strcpy(original, str);
    str_len = (int)strlen(str);

    // Toggle the character owned by this rank, if it has one.
    char my_char = '\0';
    if (rank < str_len) {
        my_char = str[rank];
        if (my_char >= 'a' && my_char <= 'z') {
            my_char = (char)(my_char - 32); // Convert to uppercase
        } else if (my_char >= 'A' && my_char <= 'Z') {
            my_char = (char)(my_char + 32); // Convert to lowercase
        }
    }

    // Gather one character from every process to process 0. Every rank
    // passes a valid send buffer; NULL is not allowed for a non-zero count.
    MPI_Gather(&my_char, 1, MPI_CHAR, gathered, 1, MPI_CHAR, 0, MPI_COMM_WORLD);

    // Print the result in process 0
    if (rank == 0) {
        // Only the first min(size, str_len) gathered chars are real data;
        // copying just those keeps the terminator and any untoggled tail.
        int toggled = size < str_len ? size : str_len;
        memcpy(str, gathered, (size_t)toggled);
        printf("Original String: %s\n", original);
        printf("Modified String: %s\n", str);
    }

    MPI_Finalize();
    return 0;
}
Original String: ThisIsALongerString
Modified String: tHiSiSaLoNgErStRiNg
5) Write a program in MPI to reverse the digits of the following integer array of size 9 with 9 processes.
Initialize the Input array to the following values.
Input array : 18, 523, 301, 1234, 2, 14, 108, 150, 1928 Output array: 81, 325, 103, 4321, 2, 41, 801, 51, 8291
#include <stdio.h>
#include <mpi.h>
#define ARRAY_SIZE 9
// Returns num with its decimal digits in reverse order (150 -> 51, 18 -> 81).
// The sign is preserved: C99 integer division truncates toward zero, so
// num % 10 carries the sign of num and the loop works for negatives too.
// The reversed value is not checked for int overflow.
int reverseDigits(int num) {
    int reversed = 0;
    while (num != 0) {
        reversed = reversed * 10 + num % 10;
        num /= 10;
    }
    return reversed;
}
// Reverses the digits of a 9-element array with 9 processes: the root
// scatters one element to each rank, every rank reverses its element, and
// the root gathers and prints the result.
int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != ARRAY_SIZE) {
        if (rank == 0) {
            fprintf(stderr, "Please run the program with exactly 9 processes.\n");
        }
        // Finalize on the error path too, so no rank exits with MPI still active.
        MPI_Finalize();
        return 1;
    }

    int inputArray[ARRAY_SIZE] = { 18, 523, 301, 1234, 2, 14, 108, 150, 1928 };
    int outputArray[ARRAY_SIZE];
    int element = 0;

    // Scatter the input array among processes (one element per rank)
    MPI_Scatter(inputArray, 1, MPI_INT, &element, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Reverse the digits
    element = reverseDigits(element);

    // Collect the reversed elements on the root, in rank order
    MPI_Gather(&element, 1, MPI_INT, outputArray, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Input array : ");
        for (int i = 0; i < ARRAY_SIZE; i++) {
            printf("%d ", inputArray[i]);
        }
        printf("\nOutput array: ");
        for (int i = 0; i < ARRAY_SIZE; i++) {
            printf("%d ", outputArray[i]);
        }
        printf("\n");
    }

    MPI_Finalize();
    return 0;
}
Input array : 18 523 301 1234 2 14 108 150 1928
Output array: 81 325 103 4321 2 41 801 51 8291
6) Write a MPI program to find the prime numbers between 1 and 100 using 2 processes.
#include <stdio.h>
#include <stdbool.h>
#include <mpi.h>
#define RANGE_START 1
#define RANGE_END 100
// Returns true when num is prime, false otherwise (including num < 2).
// Trial division up to sqrt(num); the bound is written as i <= num / i
// because i * i overflows int for num close to INT_MAX.
bool isPrime(int num) {
    if (num < 2) {
        return false;
    }
    for (int i = 2; i <= num / i; i++) {
        if (num % i == 0) {
            return false;
        }
    }
    return true;
}
// Counts the primes in [RANGE_START, RANGE_END] with exactly two processes:
// each rank counts the primes in one half of the range and rank 0 adds the
// two counts and prints the total.
int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != 2) {
        if (rank == 0) {
            fprintf(stderr, "Please run the program with exactly 2 processes.\n");
        }
        // Finalize on the error path too, so no rank exits with MPI still active.
        MPI_Finalize();
        return 1;
    }

    int start, end;
    int primesInRange = 0;

    if (rank == 0) {
        // Process 0 will check for primes in the first half of the range
        start = RANGE_START;
        end = RANGE_END / 2;
    } else {
        // Process 1 will check for primes in the second half of the range
        start = RANGE_END / 2 + 1;
        end = RANGE_END;
    }

    // Each process checks for prime numbers in its assigned range
    for (int num = start; num <= end; num++) {
        if (isPrime(num)) {
            primesInRange++;
        }
    }

    if (rank == 0) {
        // Process 0 receives the count of primes from Process 1 and adds them
        int receivedPrimes;
        MPI_Recv(&receivedPrimes, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        primesInRange += receivedPrimes;
        printf("Prime numbers between %d and %d: %d\n", RANGE_START, RANGE_END, primesInRange);
    } else {
        // Process 1 sends its count of primes to Process 0
        MPI_Send(&primesInRange, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}
Prime numbers between 1 and 100: 25
Process 1 received word: Hello
Process 0 sends word: Hello
Process 0 received toggled word: hELLO
2) Write a MPI program where the master process (process 0) sends a number to each of the slaves and the
slave processes receive the number and prints it. Use standard send.
#include <mpi.h>
#include <stdio.h>
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int number;
if (world_rank == 0) {
// Master process
// Choose a number to send to all slave processes
number = 777; // You can change this number to any number you want to send
// Use MPI_Send to send it to all the other processes
for (int i = 1; i < world_size; i++) {
MPI_Send(&number, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
printf("Master process sending number %d to process %d\n", number, i);
} else {
// Slave processes
printf("Slave process %d received number %d from master process\n", world_rank, number);
return 0;
Slave process 3 received number 123 from master process
Slave process 2 received number 123 from master process
Slave process 1 received number 123 from master process
Master process sending number 123 to process 1
Master process sending number 123 to process 2
Master process sending number 123 to process 3
3) Write a MPI program to read N elements of the array in the root process (process 0) where N is equal to the total
number of process. The root process sends one value to each of the slaves. Let even ranked process finds square of the
received element and odd ranked process finds cube of received element. Use Buffered send.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
// The root fills an array with one value per process and sends element i to
// rank i with a buffered send (MPI_Bsend). Even-ranked receivers print the
// square of the received value, odd-ranked receivers print its cube.
int main(int argc, char *argv[]) {
    MPI_Init(&argc, &argv);

    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    if (world_rank == 0) {
        // The root process (process 0) holds N elements where N is equal to world_size
        int *numbers = (int*)malloc(sizeof(int) * world_size);
        if (numbers == NULL) {
            fprintf(stderr, "Error: Out of memory.\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        // Initialize the array with some values
        for (int i = 0; i < world_size; i++) {
            numbers[i] = i + 1; // Or any other logic to initialize the array
        }

        // Buffer for buffered send. Each pending message needs its packed
        // size plus MPI_BSEND_OVERHEAD, so the overhead is counted once per
        // message, not once for the whole buffer.
        int packed_size = 0;
        MPI_Pack_size(1, MPI_INT, MPI_COMM_WORLD, &packed_size);
        int buffer_size = (world_size - 1) * (packed_size + MPI_BSEND_OVERHEAD);
        void *buffer = NULL;
        if (buffer_size > 0) {
            buffer = malloc(buffer_size);
            if (buffer == NULL) {
                fprintf(stderr, "Error: Out of memory.\n");
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            MPI_Buffer_attach(buffer, buffer_size);
        }

        // Distribute one number to each slave process
        for (int i = 1; i < world_size; i++) {
            MPI_Bsend(&numbers[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);
        }

        // Detach and free the buffer. Detaching waits for the buffered
        // messages to be transmitted, so freeing afterwards is safe.
        if (buffer != NULL) {
            MPI_Buffer_detach(&buffer, &buffer_size);
            free(buffer);
        }
        // Free numbers array on root process
        free(numbers);
    } else {
        // Each slave process receives a number and performs its operation
        int received_number;
        MPI_Recv(&received_number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        if (world_rank % 2 == 0) {
            // Even rank: square the number
            printf("Process %d received %d, squaring it to %d\n", world_rank, received_number,
                   received_number * received_number);
        } else {
            // Odd rank: cube the number
            printf("Process %d received %d, cubing it to %d\n", world_rank, received_number,
                   received_number * received_number * received_number);
        }
    }

    MPI_Finalize();
    return 0;
}
Process 1 received 2, cubing it to 8
Process 3 received 4, cubing it to 64
Process 2 received 3, squaring it to 9
4) Write a MPI program to read an integer value in the root process. Root process sends this value to Process1,
Process1 sends this value to Process2 and so on. Last process sends the value back to root process. When
sending the value each process will first increment the received value by one. Write the program using point to
point communication routines.
#include <stdio.h>
#include <mpi.h>
// Passes an integer around a ring of processes with point-to-point calls:
// the root sends the value to rank 1, every other rank increments what it
// receives and forwards it, and the last rank sends it back to the root.
int main(int argc, char** argv) {
    int value;
    int world_rank, world_size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // With a single process there is no rank 1 to send to.
    if (world_size < 2) {
        fprintf(stderr, "Please run the program with at least 2 processes.\n");
        MPI_Finalize();
        return 1;
    }

    if (world_rank == 0) {
        // Root process reads the integer value
        value = 10; // Example value, can be read from user input
        printf("Root process starts with value: %d\n", value);
        MPI_Send(&value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);

        // Root process receives the value from the last process
        MPI_Recv(&value, 1, MPI_INT, world_size - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("Root process received final value: %d\n", value);
    } else {
        MPI_Recv(&value, 1, MPI_INT, world_rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        value++; // Increment the value by one
        printf("Process %d incremented value to: %d\n", world_rank, value);

        // The successor of the last rank wraps around to the root (rank 0).
        int next_rank = (world_rank + 1) % world_size;
        MPI_Send(&value, 1, MPI_INT, next_rank, 0, MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}
Process 1 incremented value to: 11
Root process starts with value: 10
Root process received final value: 13
Process 3 incremented value to: 13
Process 2 incremented value to: 12
5) Write a MPI program to read N elements of an array in the master process. Let N processes including master
process check the array values are prime or not.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
// Function to check if a number is prime
// Returns 1 when number is prime and 0 otherwise (including number <= 1).
// Trial division up to sqrt(number); the bound is written as
// i <= number / i because i * i overflows int for values near INT_MAX.
int is_prime(int number) {
    if (number <= 1) return 0;
    for (int i = 2; i <= number / i; i++) {
        if (number % i == 0) return 0;
    }
    return 1;
}
// The root builds an array with one element per process, scatters it, and
// every process (root included) reports whether its element is prime.
int main(int argc, char** argv) {
    int world_rank, world_size, number_to_check;
    int* array = NULL; // Stays NULL on non-root ranks; sendbuf is only read at the root

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    if (world_rank == 0) {
        array = (int*)malloc(sizeof(int) * world_size); // Allocate memory only on the root process
        if (array == NULL) {
            fprintf(stderr, "Error: Out of memory.\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        for (int i = 0; i < world_size; i++) {
            array[i] = i + 2; // Example values, start from 2 (first prime number)
        }
    }

    // Use MPI_Scatter to distribute the values
    MPI_Scatter(array, 1, MPI_INT, &number_to_check, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Each process checks if the number received is prime
    int result = is_prime(number_to_check);
    printf("Process %d received %d, prime? %s\n", world_rank, number_to_check, result ? "Yes" : "No");

    // Free the allocated memory on the root process
    if (world_rank == 0) {
        free(array);
    }

    MPI_Finalize();
    return 0;
}
Process 1 received 3, prime? Yes
Process 2 received 4, prime? No
Process 0 received 2, prime? Yes
Process 3 received 5, prime? Yes
cudaMalloc(&d_x, N * sizeof(int));
cudaMalloc(&d_y, N * sizeof(int));
cudaMalloc(&d_z, N * sizeof(int));
// Initialize arrays
for (int i = 0; i < N; i++) {
x[i] = 5;
y[i] = 2;
// Copy inputs to device
cudaMemcpy(d_x, x, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);
// Launch add() kernel on GPU
add << <1, N >> > (d_x, d_y, d_z, N);
// Copy result back to host
cudaMemcpy(z, d_z, N * sizeof(int), cudaMemcpyDeviceToHost);
// Print the result
printf("Result: ");
printVector(z, N);
// Cleanup
cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
free(x); free(y); free(z);
return 0;
Result: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
// Element-wise vector addition: z[i] = x[i] + y[i] for 0 <= i < n.
// Each thread handles the element at its global 1D index; threads whose
// index is >= n do nothing, so the launch may contain surplus threads.
__global__ void add(int* x, int* y, int* z, int n) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        z[index] = x[index] + y[index];
    }
}
// Prints the first `size` elements of `vector` on one line, separated by
// spaces, and ends the line.
void printVector(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        printf("%d ", vector[i]);
    }
    printf("\n");
}
// Adds two N-element vectors on the GPU using 256-thread blocks and prints
// the result. Returns 0 on success and 1 on any allocation or CUDA error.
int main() {
    const int N = 2048; // Example size
    const size_t bytes = N * sizeof(int);

    int* x = (int*)malloc(bytes);
    int* y = (int*)malloc(bytes);
    int* z = (int*)malloc(bytes);
    int* d_x = NULL;
    int* d_y = NULL;
    int* d_z = NULL;

    if (x == NULL || y == NULL || z == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(x); free(y); free(z);
        return 1;
    }

    for (int i = 0; i < N; i++) {
        x[i] = 5;
        y[i] = 2;
    }

    // Each step runs only if every earlier step succeeded, so the first
    // error is the one that gets reported.
    cudaError_t err = cudaMalloc(&d_x, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_y, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_z, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        int blockSize = 256; // Choose appropriate block size
        int numBlocks = (N + blockSize - 1) / blockSize; // round up to cover all N elements
        add<<<numBlocks, blockSize>>>(d_x, d_y, d_z, N);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(z, d_z, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("Result: ");
        printVector(z, N);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
    free(x); free(y); free(z);
    return err == cudaSuccess ? 0 : 1;
}
Results: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
2) Implement a CUDA program to add two vectors of length N by keeping the number of threads per block as
256 (constant) and vary the number of blocks to handle N elements.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
// CUDA kernel to add elements of two arrays
// c[i] = a[i] + b[i] for 0 <= i < N. Each thread handles the element at its
// global 1D index; the bounds check lets the grid be rounded up past N.
__global__ void add(int* a, int* b, int* c, int N) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < N) {
        c[index] = a[index] + b[index];
    }
}
// Adds two N-element vectors on the GPU with a fixed 256 threads per block
// and as many blocks as needed to cover N, then prints the first ten sums.
// Returns 0 on success and 1 on any allocation or CUDA error.
int main() {
    int N = 1 << 20; // Example: Number of elements in each vector (1 Million elements)
    size_t size = (size_t)N * sizeof(int);
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; // Calculate needed blocks

    // Allocate memory on the host
    int* h_a = (int*)malloc(size);
    int* h_b = (int*)malloc(size);
    int* h_c = (int*)malloc(size);
    int* d_a = NULL;
    int* d_b = NULL;
    int* d_c = NULL;

    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_a); free(h_b); free(h_c);
        return 1;
    }

    // Initialize vectors on the host
    for (int i = 0; i < N; i++) {
        h_a[i] = i;
        h_b[i] = i;
    }

    // Allocate vectors in device memory, then copy the inputs over. Each
    // step runs only if every earlier step succeeded.
    cudaError_t err = cudaMalloc(&d_a, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_b, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_c, size);
    if (err == cudaSuccess) err = cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Invoke kernel
    if (err == cudaSuccess) {
        add<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // Copy result from device memory to host memory
    if (err == cudaSuccess) err = cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        // Print the results: Print the first 10 elements
        printf("Result of Vector Addition (First 10 Elements):\n");
        for (int i = 0; i < 10 && i < N; i++) {
            printf("h_c[%d] = %d\n", i, h_c[i]);
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Free device memory
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    // Free host memory
    free(h_a); free(h_b); free(h_c);
    return err == cudaSuccess ? 0 : 1;
}
Result of Vector Addition (First 10 Elements):
h_c[0] = 0
h_c[1] = 2
h_c[2] = 4
h_c[3] = 6
h_c[4] = 8
h_c[5] = 10
h_c[6] = 12
h_c[7] = 14
h_c[8] = 16
h_c[9] = 18
3) Write a program in CUDA which performs convolution operation on one dimensional input array N of size width using
a mask array M of size mask_width to produce the resultant one-dimensional array P of size width.
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
// CUDA Kernel for one-dimensional convolution
// One-dimensional convolution of N (length `width`) with mask M (length
// `mask_width`), written to P (length `width`).
//
// Each thread computes one output element P[i]. The mask is centred on i
// and applied in reversed order (M[mask_width - 1 - k]); input positions
// that fall outside [0, width) contribute nothing.
__global__ void convolution_1d(int* N, int* M, int* P, int width, int mask_width) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread computes one element of P
    if (i < width) {
        int pValue = 0;
        // Convolution operation
        for (int k = 0; k < mask_width; ++k) {
            int maskIndex = mask_width - 1 - k;
            int nIndex = i - (mask_width / 2) + k;
            if (nIndex >= 0 && nIndex < width) {
                pValue += N[nIndex] * M[maskIndex];
            }
        }
        P[i] = pValue;
    }
}
// Function to print the array
// Prints the first `size` elements of `array` on one line, separated by
// spaces, and ends the line.
void printArray(int* array, int size) {
    for (int i = 0; i < size; i++) {
        printf("%d ", array[i]);
    }
    printf("\n");
}
// Runs the 1D convolution kernel on a 10-element example input with a
// 3-element mask and prints the result. Returns 0 on success and 1 on any
// CUDA error.
int main() {
    const int width = 10; // Example array size
    const int mask_width = 3; // Example mask size
    int N[width] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; // Example input array
    int M[mask_width] = { 1, 0, -1 }; // Example mask array
    int P[width]; // Resultant array
    int* d_N = NULL;
    int* d_M = NULL;
    int* d_P = NULL;

    // Allocate memory on the device, then copy inputs over. Each step runs
    // only if every earlier step succeeded.
    cudaError_t err = cudaMalloc(&d_N, width * sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc(&d_M, mask_width * sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc(&d_P, width * sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(d_N, N, width * sizeof(int), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_M, M, mask_width * sizeof(int), cudaMemcpyHostToDevice);

    // Launch kernel on the GPU
    if (err == cudaSuccess) {
        convolution_1d<<<(width + 255) / 256, 256>>>(d_N, d_M, d_P, width, mask_width);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // Copy result back to host
    if (err == cudaSuccess) err = cudaMemcpy(P, d_P, width * sizeof(int), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        // Print the resultant array
        printf("Resultant array: ");
        printArray(P, width);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Free device memory
    cudaFree(d_N); cudaFree(d_M); cudaFree(d_P);
    return err == cudaSuccess ? 0 : 1;
}
Resultant array: 2 2 2 2 2 2 2 2 2 -9
4) Write a program in CUDA to process a 1D array containing angles in radians to generate the sine of the angles in
the output array. Use an appropriate function.
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
// Define M_PI if it's not defined by the math library
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// CUDA Kernel to compute sine of angles in radians
// output[i] = sinf(input[i]) for 0 <= i < n, with input angles in radians.
// Each thread handles the element at its global 1D index; threads whose
// index is >= n do nothing.
__global__ void compute_sine(float* input, float* output, int n) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        output[index] = sinf(input[index]); // Use sinf for single precision float
    }
}
// Function to print the array
// Prints the first `size` elements of `array` on one line with "%f"
// formatting, separated by spaces, and ends the line.
void printArray(float* array, int size) {
    for (int i = 0; i < size; i++) {
        printf("%f ", array[i]);
    }
    printf("\n");
}
// Computes the sine of five example angles (radians) on the GPU and prints
// the results. Returns 0 on success and 1 on any CUDA error.
int main() {
    const int n = 5; // Example array size
    // Explicit float conversions: the array is float, the constants are double.
    float inputAngles[n] = { 0.0f, (float)(M_PI / 6), (float)(M_PI / 4),
                             (float)(M_PI / 2), (float)M_PI }; // Example angles in radians
    float output[n];
    float* d_input = NULL;
    float* d_output = NULL;

    // Allocate memory on the device and copy the input array over. Each
    // step runs only if every earlier step succeeded.
    cudaError_t err = cudaMalloc((void**)&d_input, n * sizeof(float));
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_output, n * sizeof(float));
    if (err == cudaSuccess) err = cudaMemcpy(d_input, inputAngles, n * sizeof(float), cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Calculate the number of blocks and threads per block
        int threadsPerBlock = 256;
        int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
        // Launch the CUDA Kernel
        compute_sine<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // Copy the result back to the host
    if (err == cudaSuccess) err = cudaMemcpy(output, d_output, n * sizeof(float), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        // Print the resultant array
        printf("Sine of angles: ");
        printArray(output, n);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Free device memory
    cudaFree(d_input); cudaFree(d_output);
    return err == cudaSuccess ? 0 : 1;
}
Sine of angles: 0.000000 0.500000 0.707107 1.000000 -0.000000
The word ‘world’ appears 4 times in the sentence.
2) Write a CUDA program that reads a string S and produces the string RS as follows:
Input string S: PCAP Output string RS: PCAPPCAPCP Note: Each work item copies required number of
characters from S in RS.
#include <stdio.h>
#include <cuda_runtime.h>
Input string S: PCAP
Output string RS: PCAPPCAPPCAP
3) Write a CUDA program which reads a string consisting of N words and reverse each word of it in parallel.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Reverses every word of inputString into outputString.
//
// Layout contract (the host code below follows it): the string consists of
// numWords segments separated by exactly one separator character each, and
// wordLengths[w] is the length of segment w (0 for an empty segment).
// The start of word i is therefore the sum of (wordLengths[w] + 1) over all
// w < i. Each thread handles words index, index + stride, ... so any launch
// configuration covers all words. The separator after each word except the
// last is copied through unchanged; no terminator is written.
__global__ void reverseWords(char* inputString, char* outputString, int* wordLengths, int numWords) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    // Iterate over the words assigned to this thread
    for (int i = index; i < numWords; i += stride) {
        // Start index of the word: skip every earlier word and its separator
        int wordStart = 0;
        for (int w = 0; w < i; ++w) {
            wordStart += wordLengths[w] + 1;
        }
        int wordLen = wordLengths[i];

        // Reverse the word: output position j takes the mirrored input char
        for (int j = 0; j < wordLen; ++j) {
            outputString[wordStart + j] = inputString[wordStart + wordLen - 1 - j];
        }

        // Carry the separator that follows this word over to the output
        if (i < numWords - 1) {
            outputString[wordStart + wordLen] = inputString[wordStart + wordLen];
        }
    }
}

// Splits an example sentence on spaces, reverses each word on the GPU and
// prints the input and the result. Returns 0 on success and 1 on any
// allocation or CUDA error.
int main() {
    char inputString[] = "Hello World CUDA Program"; // Example input string
    // Length without the terminator, taken from the array size.
    const int length = (int)(sizeof(inputString) - 1);

    // Count the number of words: one more than the number of spaces
    int numWords = 1;
    for (int i = 0; i < length; i++) {
        if (inputString[i] == ' ') {
            numWords++;
        }
    }

    int* wordLengths = (int*)malloc(numWords * sizeof(int));
    char* outputString = (char*)malloc(length + 1); // +1 for the terminator
    if (wordLengths == NULL || outputString == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(wordLengths); free(outputString);
        return 1;
    }

    // Calculate word lengths, one entry per space-delimited segment
    int wordIndex = 0;
    int wordStartIndex = 0;
    for (int i = 0; i < length; i++) {
        if (inputString[i] == ' ') {
            wordLengths[wordIndex++] = i - wordStartIndex; // Store length of each word
            wordStartIndex = i + 1; // Move to the start of next word
        }
    }
    wordLengths[wordIndex] = length - wordStartIndex; // Length of the last word

    // Allocate device copies of inputString, outputString and wordLengths,
    // then copy the inputs over. Each step runs only if every earlier step
    // succeeded.
    char* d_inputString = NULL;
    char* d_outputString = NULL;
    int* d_wordLengths = NULL;
    cudaError_t err = cudaMalloc((void**)&d_inputString, length + 1);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_outputString, length + 1);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_wordLengths, numWords * sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(d_inputString, inputString, length + 1, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_wordLengths, wordLengths, numWords * sizeof(int), cudaMemcpyHostToDevice);

    // Launch reverseWords kernel with enough threads for one word each
    if (err == cudaSuccess) {
        int blockSize = 256;
        int numBlocks = (numWords + blockSize - 1) / blockSize;
        reverseWords<<<numBlocks, blockSize>>>(d_inputString, d_outputString, d_wordLengths, numWords);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // Copy result back to host
    if (err == cudaSuccess) err = cudaMemcpy(outputString, d_outputString, length, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        outputString[length] = '\0'; // the kernel writes no terminator
        printf("Input string: %s\n", inputString);
        printf("Output string: %s\n", outputString);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Cleanup
    cudaFree(d_inputString); cudaFree(d_outputString); cudaFree(d_wordLengths);
    free(wordLengths); free(outputString);
    return err == cudaSuccess ? 0 : 1;
}
Input string: Hello World
Output string: olleH dlroW
4) Write a CUDA program that takes a string Sin as input and one integer value N and produces an output
string, Sout, in parallel by concatenating input string Sin, N times as shown below.
INPUT : Sin ="Hello" N =3 OUTPUT : Sout = "HelloHelloHello" Note: Every thread copies the same
character from the Input string S, N times to the required position.
#include <stdio.h>
#include <cuda_runtime.h>
// Writes inputString repeated `repetitions` times into outputString, which
// must hold at least inputLength * repetitions characters. No terminator is
// written. Each thread handles output positions index, index + stride, ...
// so any launch configuration covers the whole output.
__global__ void concatenateString(char* inputString, int inputLength, char* outputString, int repetitions) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    // Copy characters from input string to output string
    for (int i = index; i < inputLength * repetitions; i += stride) {
        outputString[i] = inputString[i % inputLength]; // Copy characters from input string repeatedly
    }
}
// Concatenates an example string with itself `repetitions` times on the GPU
// and prints the input and the result. Returns 0 on success and 1 on any
// allocation or CUDA error.
int main() {
    char inputString[] = "Hello"; // Example input string
    // Length without the terminator, taken from the array size.
    const int length = (int)(sizeof(inputString) - 1);
    int repetitions = 3; // Number of times input string should be repeated
    int outputLength = length * repetitions; // Length of the output string

    char* outputString = (char*)malloc(outputLength + 1); // +1 for the terminator
    if (outputString == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // Allocate memory for device copies of inputString and outputString,
    // then copy the input over. Sizes are padded by one byte so an empty
    // string never requests a zero-byte allocation.
    char* d_inputString = NULL;
    char* d_outputString = NULL;
    cudaError_t err = cudaMalloc((void**)&d_inputString, length + 1);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_outputString, outputLength + 1);
    if (err == cudaSuccess) err = cudaMemcpy(d_inputString, inputString, length + 1, cudaMemcpyHostToDevice);

    // Launch concatenateString kernel on GPU with enough blocks and threads
    if (err == cudaSuccess && outputLength > 0) {
        int blockSize = 256;
        int numBlocks = (outputLength + blockSize - 1) / blockSize;
        concatenateString<<<numBlocks, blockSize>>>(d_inputString, length, d_outputString, repetitions);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // Copy result back to host
    if (err == cudaSuccess) err = cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        outputString[outputLength] = '\0'; // the kernel writes no terminator
        printf("Input string: %s\n", inputString);
        printf("Output string: %s\n", outputString);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Cleanup
    cudaFree(d_inputString); cudaFree(d_outputString);
    free(outputString);
    return err == cudaSuccess ? 0 : 1;
}
Input string: Hello
Output string: HelloHelloHello
5) Write a CUDA program which reads a string Sin and produces an output string T as shown below.
Input: Sin: "Hai" Output: T: "Haaiii" Note: Every thread stores a character from input string Sin, the required
number of times, into output string T.
#include <stdio.h>
#include <cuda_runtime.h>
// Builds the "Hai" -> "Haaiii" pattern: the character at position i of
// inputString is stored (i + 1) times in outputString.
//
// The run for character i starts at offset i * (i + 1) / 2 (the total
// length of all earlier runs), so every thread writes a disjoint range and
// no two threads touch the same output position. outputString must hold at
// least inputLength * (inputLength + 1) / 2 characters; no terminator is
// written. `repetitions` is not used: the repeat count is fixed by the
// character position. The parameter is kept so the signature is unchanged.
__global__ void repeatCharacters(char* inputString, int inputLength, char* outputString, int repetitions) {
    (void)repetitions;
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    // Each thread handles input characters index, index + stride, ...
    for (int i = index; i < inputLength; i += stride) {
        int offset = i * (i + 1) / 2;
        for (int r = 0; r <= i; ++r) {
            outputString[offset + r] = inputString[i];
        }
    }
}

// Expands an example string with repeatCharacters on the GPU and prints the
// input and the result. Returns 0 on success and 1 on any allocation or
// CUDA error.
int main() {
    char inputString[] = "Hai"; // Example input string
    // Length without the terminator, taken from the array size.
    const int length = (int)(sizeof(inputString) - 1);
    int repetitions = length; // Largest repeat count (last character); ignored by the kernel
    int outputLength = length * (length + 1) / 2; // 1 + 2 + ... + length

    char* outputString = (char*)malloc(outputLength + 1); // +1 for the terminator
    if (outputString == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // Allocate memory for device copies of inputString and outputString,
    // then copy the input over. Sizes are padded by one byte so an empty
    // string never requests a zero-byte allocation.
    char* d_inputString = NULL;
    char* d_outputString = NULL;
    cudaError_t err = cudaMalloc((void**)&d_inputString, length + 1);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_outputString, outputLength + 1);
    if (err == cudaSuccess) err = cudaMemcpy(d_inputString, inputString, length + 1, cudaMemcpyHostToDevice);

    // Launch repeatCharacters kernel with enough threads for one input character each
    if (err == cudaSuccess && length > 0) {
        int blockSize = 256;
        int numBlocks = (length + blockSize - 1) / blockSize;
        repeatCharacters<<<numBlocks, blockSize>>>(d_inputString, length, d_outputString, repetitions);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // Copy result back to host
    if (err == cudaSuccess) err = cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        outputString[outputLength] = '\0'; // the kernel writes no terminator
        printf("Input string: %s\n", inputString);
        printf("Output string: %s\n", outputString);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Cleanup
    cudaFree(d_inputString); cudaFree(d_outputString);
    free(outputString);
    return err == cudaSuccess ? 0 : 1;
}
Input string: Hello
Output string: Heelllllllooooo
Matrix A:
1.00 2.00 3.00
4.00 5.00 6.00
7.00 8.00 9.00
Matrix B:
0.34 -0.67 -2.38
-3.33 -3.66 -4.60
-6.08 -7.32 -7.47
2) Write a program in CUDA to multiply two matrices for the following specifications:
• Each row of resultant matrix to be computed by one thread.
• Each column of resultant matrix to be computed by one thread.
• Each element of resultant matrix to be computed by one thread.
#include <stdio.h>
#include <cuda_runtime.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Kernel function to multiply two matrices
// c = a * b for square, row-major width x width integer matrices.
// Each thread computes one element of c from its 2D global index
// (row from the y dimension, column from the x dimension); threads outside
// the matrix do nothing.
__global__ void matrixMultiplication(int *a, int *b, int *c, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y; // Row index
    int col = blockIdx.x * blockDim.x + threadIdx.x; // Column index

    // Check if within matrix bounds
    if (row < width && col < width) {
        int sum = 0;
        for (int i = 0; i < width; i++) {
            sum += a[row * width + i] * b[i * width + col];
        }
        c[row * width + col] = sum;
    }
}
// Multiplies two width x width example matrices on the GPU, one thread per
// result element, and prints the product. Returns 0 on success and 1 on
// any allocation or CUDA error.
int main() {
    int width = 4; // Width of matrices
    size_t size = (size_t)width * width * sizeof(int); // Size of matrices in bytes

    // Host matrices and result matrix
    int *h_a = (int*)malloc(size);
    int *h_b = (int*)malloc(size);
    int *h_c = (int*)malloc(size);
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;

    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_a); free(h_b); free(h_c);
        return 1;
    }

    // Initialize host matrices with sample data
    for (int i = 0; i < width * width; i++) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Device matrices, then copy host matrices to device. Each step runs
    // only if every earlier step succeeded.
    cudaError_t err = cudaMalloc(&d_a, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_b, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_c, size);
    if (err == cudaSuccess) err = cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Define kernel launch configuration. The grid is rounded up so a
        // width that is not a multiple of the block size is still covered;
        // the kernel's bounds check discards the surplus threads.
        dim3 threadsPerBlock(2, 2);
        dim3 numBlocks((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (width + threadsPerBlock.y - 1) / threadsPerBlock.y);
        // Launch kernel
        matrixMultiplication<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, width);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    // Copy result matrix from device to host
    if (err == cudaSuccess) err = cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        // Print result matrix, one row per line
        printf("Result Matrix:\n");
        for (int i = 0; i < width; i++) {
            for (int j = 0; j < width; j++) {
                printf("%d ", h_c[i * width + j]);
            }
            printf("\n");
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Free device memory
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    // Free host memory
    free(h_a); free(h_b); free(h_c);
    return err == cudaSuccess ? 0 : 1;
}
Matrix A:
8 9 10 11
12 13 14 15
Matrix B:
8 10 12 14
16 18 20 22
24 26 28 30
Result Matrix:
28 34 40 46
76 98 120 142
124 162 200 238
172 226 280 334
3) Write a CUDA program to perform a linear algebra function of the form y = (alpha)x + y, where x and y are
vectors and "alpha" is a scalar value.
#include <stdio.h>
#include <cuda_runtime.h>
// In-place update y[i] = alpha * x[i] + y[i] for 0 <= i < N.
// Each thread handles the element at its global 1D index; threads whose
// index is >= N do nothing.
__global__ void vectorAdd(float *x, float *y, float alpha, int N) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < N) {
        y[idx] = alpha * x[idx] + y[idx];
    }
}
// Computes y = alpha * x + y on the GPU for two 5-element example vectors
// and prints the updated y. Returns 0 on success and 1 on any CUDA error.
int main() {
    const int N = 5;
    float x[N] = {1, 2, 3, 4, 5};
    float y[N] = {6, 7, 8, 9, 10};
    float alpha = 2;
    float *d_x = NULL;
    float *d_y = NULL;

    // Each step runs only if every earlier step succeeded, so the first
    // error is the one that gets reported.
    cudaError_t err = cudaMalloc(&d_x, N * sizeof(float));
    if (err == cudaSuccess) err = cudaMalloc(&d_y, N * sizeof(float));
    if (err == cudaSuccess) err = cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        vectorAdd<<<numBlocks, blockSize>>>(d_x, d_y, alpha, N);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    if (err == cudaSuccess) err = cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("Resulting vector y:\n");
        for (int i = 0; i < N; ++i) {
            printf("%f ", y[i]);
        }
        printf("\n");
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_x); cudaFree(d_y);
    return err == cudaSuccess ? 0 : 1;
}
Input vector x:
Input vector y:
6 7 8 9 10
Scalar alpha: 2
Resulting vector y:
8. 11 14 17 20
4) Write a CUDA program to sort every row of a matrix using selection sort.
#include <stdio.h>
#include <cuda_runtime.h>
// Sorts row[0 .. size-1] into ascending order in place with selection sort:
// for each position i, find the smallest remaining element and swap it into
// place. Runs sequentially inside the calling thread.
__device__ void selectionSort(int *row, int size) {
    for (int i = 0; i < size - 1; ++i) {
        int minIndex = i;
        for (int j = i + 1; j < size; ++j) {
            if (row[j] < row[minIndex]) {
                minIndex = j;
            }
        }
        // The swap happens once per outer iteration, after the scan.
        if (minIndex != i) {
            int temp = row[i];
            row[i] = row[minIndex];
            row[minIndex] = temp;
        }
    }
}
// Sorts every row of a row-major rows x cols matrix in place.
// Each thread sorts the row given by its global 1D index by calling
// selectionSort; threads whose index is >= rows do nothing.
__global__ void sortRows(int *matrix, int rows, int cols) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < rows) {
        int *rowPtr = matrix + row * cols;
        selectionSort(rowPtr, cols);
    }
}
// Sorts each row of a 3 x 4 example matrix on the GPU and prints the sorted
// matrix. Returns 0 on success and 1 on any CUDA error.
int main() {
    const int rows = 3;
    const int cols = 4;
    int matrix[rows][cols] = {{9, 7, 5, 8},
                              {4, 6, 2, 1},
                              {3, 0, 2, 5}};
    const size_t bytes = rows * cols * sizeof(int);
    int *d_matrix = NULL;

    // Each step runs only if every earlier step succeeded, so the first
    // error is the one that gets reported.
    cudaError_t err = cudaMalloc((void**)&d_matrix, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(d_matrix, matrix, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        int blockSize = 4; // Number of threads per block
        int numBlocks = (rows + blockSize - 1) / blockSize;
        sortRows<<<numBlocks, blockSize>>>(d_matrix, rows, cols);
        err = cudaGetLastError(); // launches do not return errors directly
    }

    if (err == cudaSuccess) err = cudaMemcpy(matrix, d_matrix, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        printf("Sorted Matrix:\n");
        for (int i = 0; i < rows; ++i) {
            for (int j = 0; j < cols; ++j) {
                printf("%d ", matrix[i][j]);
            }
            printf("\n");
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_matrix);
    return err == cudaSuccess ? 0 : 1;
}
Input matrix:
Sorted Array: