
Серебряков И.А.

Report (group ПИН-12М)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#define ARRAY_SIZE 3
#define MATRIX_SIZE 1024
#define COUNT_THREAD 3

__global__ void multiplyKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] * b[i];
}

// Helper function for using CUDA to multiply two vectors element-wise in parallel.
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU
installed?");
goto Error;
}

// Allocate GPU buffers for three vectors (two input, one output).
cudaStatus = cudaMalloc((void**)&dev_c, ARRAY_SIZE * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_a, ARRAY_SIZE * sizeof(int));


if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
Серебряков И.А. Отчет ПИН-12М
goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b, ARRAY_SIZE * sizeof(int));


if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}

// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

cudaStatus = cudaMemcpy(dev_b, b, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

// Launch a kernel on the GPU with one thread for each element.
multiplyKernel<<<1, ARRAY_SIZE>>>(dev_c, dev_a, dev_b);

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching multiplyKernel!\n", cudaStatus);
goto Error;
}

// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);

return cudaStatus;
}

void task1()
{
printf("TASK 1 RUN\n");
int a[ARRAY_SIZE] = { 0 };
int b[ARRAY_SIZE] = { 0 };
int c[ARRAY_SIZE] = { 0 };
cudaError_t cudaStatus = cudaSuccess;

srand(time(NULL));
for (int i = 0; i < ARRAY_SIZE; i++) {
a[i] = rand();
b[i] = rand();
}

cudaStatus = multiplyWithCuda(c, a, b);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "multiplyWithCuda failed!");
return;
}

for (int i = 0; i < ARRAY_SIZE; i++) {


printf("%d * %d = %d\n", a[i], b[i], c[i]);
}

cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return;
}
}
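The report does not compare the GPU result of task 1 with a CPU reference. A minimal host-side check could look like the sketch below; the verifyMultiply helper is an assumption added for illustration and is not part of the original program:

// Hypothetical helper (not in the original report): returns true when every
// element of c equals the product of the corresponding elements of a and b.
bool verifyMultiply(const int *a, const int *b, const int *c, int n)
{
    for (int i = 0; i < n; i++) {
        if (c[i] != a[i] * b[i])
            return false;
    }
    return true;
}

In task1() it could be called right after multiplyWithCuda, for example verifyMultiply(a, b, c, ARRAY_SIZE).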

void task2()
{
printf("TASK 2 RUN\n");
int *a;
int *b;
int *c;

// create the CUDA streams
cudaStream_t stream[COUNT_THREAD];
for( int i=0; i < COUNT_THREAD; ++i)
cudaStreamCreate(&stream[i]);

// allocate pinned host memory for the input and output arrays
unsigned int mem_size = sizeof (int) * ARRAY_SIZE;
cudaMallocHost((void**)&a, COUNT_THREAD*mem_size);
cudaMallocHost((void**)&b, COUNT_THREAD*mem_size);
cudaMallocHost((void**)&c, COUNT_THREAD*mem_size);

// allocate device memory for the input and output arrays
int *inputDevPtrA;
int *inputDevPtrB;
int *outputDevPtr;
cudaMalloc( (void**) & inputDevPtrA, COUNT_THREAD * mem_size);
cudaMalloc( (void**) & inputDevPtrB, COUNT_THREAD * mem_size);
cudaMalloc( (void**) & outputDevPtr, COUNT_THREAD * mem_size);

// fill the host input arrays with random values
srand(time(NULL));
for (int i = 0; i < ARRAY_SIZE; i++) {
a[i] = rand();
b[i] = rand();
}

// asynchronously copy each stream's chunk of the input arrays to the device
for( int i=0; i < COUNT_THREAD; ++i) {
cudaMemcpyAsync(inputDevPtrA +i*ARRAY_SIZE, a +i*ARRAY_SIZE, mem_size,
cudaMemcpyHostToDevice, stream[i] );
cudaMemcpyAsync(inputDevPtrB +i*ARRAY_SIZE, b +i*ARRAY_SIZE, mem_size,
cudaMemcpyHostToDevice, stream[i] );
}
// process each stream's chunk of the array
// (one thread per element, since the kernel indexes only by threadIdx.x)
for( int i=0; i < COUNT_THREAD; ++i)
multiplyKernel<<<1, ARRAY_SIZE, 0, stream[i]>>>(outputDevPtr +i*ARRAY_SIZE, inputDevPtrA +i*ARRAY_SIZE, inputDevPtrB +i*ARRAY_SIZE);

// asynchronously copy the results from device back to host
for( int i=0; i < COUNT_THREAD; ++i)
cudaMemcpyAsync(c +i*ARRAY_SIZE, outputDevPtr +i*ARRAY_SIZE, mem_size,
cudaMemcpyDeviceToHost, stream[i]);

// wait for all CUDA streams to finish
cudaDeviceSynchronize();

// destroy the CUDA streams
for( int i=0; i < COUNT_THREAD; ++i)
cudaStreamDestroy(stream[i]);

for (int i = 0; i < ARRAY_SIZE; i++) {


printf("%d * %d = %d\n", a[i], b[i], c[i]);
}
}
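cudaDeviceSynchronize() in task2 waits for all outstanding work on the device. If each stream needs to be waited on individually instead, one possible variant (a sketch, not part of the original report) is:

// Hypothetical alternative: block on each stream separately rather than
// synchronizing the whole device.
for (int i = 0; i < COUNT_THREAD; ++i)
    cudaStreamSynchronize(stream[i]);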

__global__ void transposeKernel(int* inData, int* outData)
{
unsigned int xIndex = blockDim.x*blockIdx.x + threadIdx.x;
unsigned int yIndex = blockDim.y*blockIdx.y + threadIdx.y;
unsigned int inIndex = xIndex + MATRIX_SIZE*yIndex;
unsigned int outIndex = yIndex + MATRIX_SIZE*xIndex;
outData[outIndex] = inData[inIndex];
}
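To illustrate the index mapping in transposeKernel: with MATRIX_SIZE = 1024, the thread with xIndex = 2 and yIndex = 5 reads inData[2 + 1024*5] = inData[5122], the element in row 5, column 2 of the row-major input matrix, and writes it to outData[5 + 1024*2] = outData[2053], which is row 2, column 5 of the output. Every element (y, x) of the input therefore ends up at position (x, y) of the output.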

// Helper function for using CUDA to transpose a matrix on the GPU.
cudaError_t transposeWithCuda(int *b, const int *a)
{
int N = 1024;
int BSX = 16;
int BSY = 16;
int *dev_a = 0;
int *dev_b = 0;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU
installed?");
goto Error;
}

// Allocate GPU buffers for the input and output matrices.
cudaStatus = cudaMalloc((void**)&dev_a, MATRIX_SIZE * MATRIX_SIZE * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b, MATRIX_SIZE * MATRIX_SIZE * sizeof(int));


if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}

// Copy the input matrix from host memory to the GPU buffer.
cudaStatus = cudaMemcpy(dev_a, a, MATRIX_SIZE * MATRIX_SIZE * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

dim3 grid = dim3(N/BSX, N/BSY, 1);
dim3 bdim = dim3(BSX, BSY, 1);

// Launch a kernel on the GPU with one thread for each element.
transposeKernel<<<grid, bdim>>>(dev_a, dev_b);

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching transposeKernel!\n", cudaStatus);
goto Error;
}

// Copy the transposed matrix from the GPU buffer back to host memory.
cudaStatus = cudaMemcpy(b, dev_b, MATRIX_SIZE * MATRIX_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

Error:
cudaFree(dev_a);
cudaFree(dev_b);

return cudaStatus;
}

int getIndex(int * a, int x, int y, int size)
{
return a[x*size+y];
}

// Returns true if b is the transpose of a (both are size x size, row-major).
bool transponize(int* a, int *b, int size)
{
for(int i = 0; i<size; i++) {
for(int j = 0; j<size; j++) {
int q = getIndex(a, i, j, size);
int w = getIndex(b, j, i, size);
if (q != w){
return false;
}
}
}
return true;
}

void task3()
{
printf("TASK 3 RUN\n");
int *a;
int *b;
cudaError_t cudaStatus = cudaSuccess;

a = (int*) malloc(sizeof(int) * MATRIX_SIZE * MATRIX_SIZE);
b = (int*) malloc(sizeof(int) * MATRIX_SIZE * MATRIX_SIZE);

srand(time(NULL));
for (int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; i++){
a[i] = rand();
}

cudaStatus = transposeWithCuda(b, a);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "transposeWithCuda failed!");
return;
}

bool ok = transponize(a, b, MATRIX_SIZE);
printf("transpose check: %s\n", ok ? "OK" : "FAILED");

cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return;
}

free(a);
free(b);
}

int main()
{
task1();
task2();
task3();

return 0;
}
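Every CUDA call in the listing above is followed by the same if (cudaStatus != cudaSuccess) { ... goto Error; } check. This pattern could be factored into a macro; the sketch below is only an illustration (the CUDA_CHECK name is an assumption and is not used anywhere in the report), and cudaGetErrorString is the standard CUDA runtime call that turns an error code into a readable message:

// Hypothetical convenience macro (not used in the listing above): runs a CUDA
// call, stores its result in cudaStatus, and on failure prints the call text
// plus the runtime's error string before jumping to the Error label.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaStatus = (call);                                          \
        if (cudaStatus != cudaSuccess) {                              \
            fprintf(stderr, "%s failed: %s\n", #call,                 \
                    cudaGetErrorString(cudaStatus));                  \
            goto Error;                                               \
        }                                                             \
    } while (0)

With it, a call plus its check, such as the first allocation in multiplyWithCuda, shrinks to CUDA_CHECK(cudaMalloc((void**)&dev_c, ARRAY_SIZE * sizeof(int)));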

Timing for different numbers of CUDA streams in the second task:

void task2()
{
printf("TASK 2 RUN\n");
int *a;
int *b;
int *c;

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// create the CUDA streams
cudaStream_t stream[COUNT_THREAD];
for( int i=0; i < COUNT_THREAD; ++i)
cudaStreamCreate(&stream[i]);

// allocate pinned host memory for the input and output arrays
unsigned int mem_size = sizeof (int) * ARRAY_SIZE;
cudaMallocHost((void**)&a, COUNT_THREAD*mem_size);
cudaMallocHost((void**)&b, COUNT_THREAD*mem_size);
cudaMallocHost((void**)&c, COUNT_THREAD*mem_size);

// allocate device memory for the input and output arrays
int *inputDevPtrA;
int *inputDevPtrB;
int *outputDevPtr;
cudaMalloc( (void**) & inputDevPtrA, COUNT_THREAD * mem_size);
cudaMalloc( (void**) & inputDevPtrB, COUNT_THREAD * mem_size);
cudaMalloc( (void**) & outputDevPtr, COUNT_THREAD * mem_size);

// fill the host input arrays with random values
srand(time(NULL));
for (int i = 0; i < ARRAY_SIZE; i++) {
a[i] = rand();
b[i] = rand();
}
cudaEventRecord(start, 0);
// asynchronously copy each stream's chunk of the input arrays to the device
for( int i=0; i < COUNT_THREAD; ++i) {
cudaMemcpyAsync(inputDevPtrA +i*ARRAY_SIZE, a +i*ARRAY_SIZE, mem_size,
cudaMemcpyHostToDevice, stream[i] );
cudaMemcpyAsync(inputDevPtrB +i*ARRAY_SIZE, b +i*ARRAY_SIZE, mem_size,
cudaMemcpyHostToDevice, stream[i] );
}

// process each stream's chunk of the array
// (one thread per element, since the kernel indexes only by threadIdx.x)
for( int i=0; i < COUNT_THREAD; ++i)
multiplyKernel<<<1, ARRAY_SIZE, 0, stream[i]>>>(outputDevPtr +i*ARRAY_SIZE, inputDevPtrA +i*ARRAY_SIZE, inputDevPtrB +i*ARRAY_SIZE);

// asynchronously copy the results from device back to host
for( int i=0; i < COUNT_THREAD; ++i)
cudaMemcpyAsync(c +i*ARRAY_SIZE, outputDevPtr +i*ARRAY_SIZE, mem_size,
cudaMemcpyDeviceToHost, stream[i]);

// wait for all CUDA streams to finish
cudaDeviceSynchronize();

// destroy the CUDA streams
for( int i=0; i < COUNT_THREAD; ++i)
cudaStreamDestroy(stream[i]);

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time = 0.0;
cudaEventElapsedTime(&time, start, stop);

printf("time: %.5f\ncount thread: %d\narray size: %d", time, COUNT_THREAD,


ARRAY_SIZE);
