CUDA Comparison

Helder Eiki Oshiro
80568704
MIDTERM 1
Five versions of the program were written:
1. Sequential: processed entirely by the CPU.
2. Simple kernel: processed on the GPU using only global memory.
3. Constant memory only: the mask is stored in constant memory and the image in global memory.
4. Tiled only: the image is staged in shared memory.
5. Tiling + constant memory: the image is tiled in shared memory and the mask is stored in constant memory.
The computation times are shown in the tables below. Processing time was measured in microseconds
and covers only the convolution function, not the time spent reading or saving files. Kernel execution
time was measured with the nvprof tool, and the sequential code was timed with gettimeofday(). In both cases,
the measurements were taken on a computer equipped with an Intel Core i5-6300HQ CPU and an NVIDIA GTX 960M GPU.
CPU
Sharpen
Gaussian
Apollo
Lena
2,660,925
39,252
2,558,420
42,079
Tiled only
Sharpen
Gaussian
Apollo
Simple Kernel
Sharpen
Gaussian
Apollo
Tiling + cnst memory

Sharpen
Gaussian
Apollo
Const memory only

Sharpen
Gaussian
Apollo
Lena
20,861
20,860
385
384
Lena
18,465
18,462
294
291
Lena
10,550
10,578
166
164
Lena
6,533
6,526
Figure 1 Timing for each version, measured in microseconds
The results were plotted in the graph below.
102
100
APOLLO.BMP (MICROSECONDS)
Gaussian
TILED ONLY
SIMPLE
TILED + CONST
MEMORY
6,526
6,533
10,578
10,550
18,462
18,465
20,860
20,861
Sharpen
CONSTANT MEMORY
ONLY
LENA.BMP (MICROSECONDS)
Gaussian
TILED ONLY
SIMPLE
TILED + CONST
MEMORY
100
102
164
166
291
294
384
385
Sharpen
CONSTANT MEMORY
ONLY
As we can see, the version using only constant memory is the fastest, while the kernel using only shared
memory is the slowest.
Analyzing this behavior, we can presume that transferring the image from global memory to shared memory
is not worth the effort because of the overhead it adds. Each output element requires only 25 memory
accesses to the image matrix (one per mask element) and 25 accesses to the mask matrix. By comparison,
in the matrix multiplication kernel, whose performance improves greatly with tiling, each output
element requires number_of_columns + number_of_rows memory accesses.
Another possible explanation for the constant memory version being the fastest is that, since each thread
shares many input elements with its neighbors, some of those elements may already be in cache,
which compensates for not using shared memory. Furthermore, it spends no time loading data into shared memory.
SEQUENTIAL CODE
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include "cips.h"
/*
 * Sequential (CPU) version: applies a 5x5 convolution mask to an image and
 * prints the time spent in the convolution loop, in microseconds.
 *
 * Command line: in-image out-image length width [il ie]
 *   in-image   input image file
 *   out-image  output image file (a BMP is created when the input is a BMP)
 *   length     number of output rows
 *   width      number of output columns
 *   il ie      accepted for command-line compatibility; never used here
 *
 * Returns 0 on success; exits with EXIT_FAILURE on a usage or input error.
 */
int main (int argc, char *argv[])
{
  char name[80], name2[80];
  int i, j, m, n;
  int in_length = 0, in_width = 0, out_length = 0, out_width = 0;
  short **the_image, **out_image;
  struct bmpfileheader bmp_file_header;
  struct bitmapheader bmheader;
  struct timeval start, end;
  long elapsed_micros;

  /*
   * Gaussian mask; its weights sum to 256, which is the divisor used below.
   *
   * Alternative sharpen mask (disabled):
   *   { -1, -1, -1, -1, -1 },
   *   { -1,  2,  2,  2, -1 },
   *   { -1,  2,  8,  2, -1 },
   *   { -1,  2,  2,  2, -1 },
   *   { -1, -1, -1, -1, -1 }
   */
  static const int mask[5][5] = {
    { 1,  4,  6,  4, 1 },
    { 4, 16, 24, 16, 4 },
    { 6, 24, 36, 24, 6 },
    { 4, 16, 24, 16, 4 },
    { 1,  4,  6,  4, 1 }
  };

  /******************************************
  *
  * Ensure the command line is correct.
  *
  ******************************************/
  if (argc < 5 || (argc > 5 && argc < 7)) {
    printf ("\nusage: roundoff in-image out-image"
            " length width [il ie]"
            "\n"
            "\n If you do not specify il ie"
            " they will be set to 1 1."
            "\n ll le will always be" " il+length and ie+width" "\n");
    exit (EXIT_FAILURE);
  }

  /* Bounded copies: a long argv entry must not overflow the 80-byte buffers. */
  if (snprintf (name, sizeof name, "%s", argv[1]) >= (int) sizeof name ||
      snprintf (name2, sizeof name2, "%s", argv[2]) >= (int) sizeof name2) {
    printf ("\nERROR file names are limited to %d characters",
            (int) sizeof name - 1);
    exit (EXIT_FAILURE);
  }
  out_length = atoi (argv[3]);
  out_width = atoi (argv[4]);

  if (does_not_exist (name)) {
    printf ("\nERROR input file %s does not exist", name);
    exit (EXIT_FAILURE);
  }
  get_image_size (name, &in_length, &in_width);

  /* The convolution reads the input with output-sized indices, so the
     requested output must fit inside the input image. */
  if (out_length < 1 || out_width < 1 ||
      out_length > in_length || out_width > in_width) {
    printf ("\nERROR output size %dx%d must be positive and no larger"
            " than the input size %dx%d",
            out_length, out_width, in_length, in_width);
    exit (EXIT_FAILURE);
  }

  the_image = (short **) allocate_image_array (in_length, in_width);
  read_image_array (name, the_image);

  /******************************************
  *
  * Create the output image and allocate
  * the output image array.
  *
  ******************************************/
  if (is_a_bmp (name)) {
    read_bmp_file_header (name, &bmp_file_header);
    read_bm_header (name, &bmheader);
    bmheader.height = out_length;
    bmheader.width = out_width;
    create_allocate_bmp_file (name2, &bmp_file_header, &bmheader);
  }
  out_image = (short **) allocate_image_array (out_length, out_width);

  /******************************************
  *
  * Convolve the input image with the mask
  * and time only the convolution itself.
  *
  ******************************************/
  gettimeofday (&start, NULL);
  for (i = 2; i < out_length - 3; i++) {
    for (j = 2; j < out_width - 3; j++) {
      int acc = 0;
      for (m = 0; m < 5; m++) {
        for (n = 0; n < 5; n++) {
          acc += mask[m][n] * the_image[i - 2 + m][j - 2 + n];
        }
      }
      acc = acc / 256;
      if (acc > 255) acc = 255;
      out_image[i][j] = (short) acc;
    }
  }
  gettimeofday (&end, NULL);

  /* Subtract before scaling so the seconds term cannot overflow. */
  elapsed_micros = (long) (end.tv_sec - start.tv_sec) * 1000000L
                 + (long) (end.tv_usec - start.tv_usec);
  printf ("%ld\n", elapsed_micros);

  write_image_array (name2, out_image);
  free_image_array (out_image, out_length);
  free_image_array (the_image, in_length);
  return 0;
}
/* ends main */
SIMPLE KERNEL
***********************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);
/*
 * 5x5 Gaussian convolution using global memory only; one thread per output
 * pixel.
 *
 * Launch layout: 2D grid of 2D blocks; thread (x, y) handles column x, row y.
 * The grid may be larger than the image because out-of-range threads do
 * nothing.
 *
 *   N              input image, row-major, `width` elements per row
 *   P              output image, same layout as N
 *   Mask           mask coefficients, row-major, 5 per row
 *   mask_elements  number of coefficients to apply (25 for the full 5x5 mask)
 *   width          image width; also used as the row bound, so the image is
 *                  treated as square
 *
 * Pixels outside rows/columns [2, width - 4] are not written. This is the
 * same range the sequential reference loop covers.
 */
__global__ void GaussConvolution(short *N, short *P, short *Mask, int mask_elements, int width){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    if ((row >= 2) && (col >= 2) && (row < (width - 3)) && (col < (width - 3))){
        /* int, not short: the weights sum to 256, so a bright neighbourhood
           reaches 255 * 256 = 65280, which does not fit in a short. */
        int acc = 0;
        int i;
        for (i = 0; i < mask_elements; i++){
            /* i / 5 selects the mask row, i % 5 the mask column. */
            acc += Mask[i] * N[(row - 2 + i / 5) * width + (col - 2 + i % 5)];
        }
        acc /= 256;
        /* Clamp in a register and store once instead of re-reading P. */
        if (acc > 255)
            acc = 255;
        P[row * width + col] = (short)acc;
    }
}
// Sharpen convolution kernel for the simple (global-memory) version.
// Each thread derives its row/column from the 2D block and thread indices and
// accumulates Mask[i] * N[...] over mask_elements coefficients, where i / 5
// selects the mask row and i % 5 the mask column.
// NOTE(review): this listing looks incomplete. The `else` below has no
// matching `if`, the braces do not balance, and P is never written. The
// boundary guard and the store to P were presumably lost when the listing was
// extracted; restore them from the original source before compiling.
__global__ void SharpenConvolution(short *N, short *P, short *Mask, int mask_elements, int
width){
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = blockDim.x * blockIdx.x + threadIdx.x;
short acc = 0;
int i;
// NOTE(review): without a guard, threads in the first two rows/columns index
// N before the start of the array (row - 2 / col - 2 are negative).
for (i = 0; i < mask_elements; i++){
acc = acc + Mask[i] * N[((col - 2) + i % 5) + (row - 2) * width + width*(i
/ 5)];
}
}
else{
}
}
__constant__ short sharpen_ptr[25];
//__constant__ short gauss_ptr;
/* Aborts with a readable message when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "\nERROR %s failed: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Host driver for the simple (global-memory) kernels.
 *
 * Command line: in-image out-image length width [il ie]
 *   il ie are accepted for command-line compatibility but never used.
 *
 * Reads the input image, flattens it to a 1D array, runs the convolution
 * kernel on the device, copies the result back and writes the output image.
 * The kernels take a single `width` and use it as the row bound too, so only
 * square images are accepted.
 *
 * NOTE(review): several declarations and the usage text were missing from
 * this listing; they are restored here to match the sequential version of
 * the same program. Confirm against the original source.
 */
int main(int argc, char *argv[])
{
    char name[80], name2[80];
    int in_length = 0, in_width = 0, out_length = 0, out_width = 0;
    short **the_image, **out_image;
    struct bmpfileheader bmp_file_header;
    struct bitmapheader bmheader;
    int m, n;

    /******************************************
    *
    * Ensure the command line is correct.
    *
    ******************************************/
    if (argc < 5 || (argc > 5 && argc < 7)) {
        printf("\nusage: roundoff in-image out-image"
               " length width [il ie]"
               "\n"
               "\n If you do not specify il ie"
               " they will be set to 1 1."
               "\n ll le will always be" " il+length and ie+width" "\n");
        exit(1);
    }

    /* Bounded copies: a long argv entry must not overflow the 80-byte buffers. */
    if (snprintf(name, sizeof name, "%s", argv[1]) >= (int)sizeof name ||
        snprintf(name2, sizeof name2, "%s", argv[2]) >= (int)sizeof name2) {
        printf("\nERROR file names are limited to %d characters", (int)sizeof name - 1);
        exit(1);
    }
    out_length = atoi(argv[3]);
    out_width = atoi(argv[4]);

    if (does_not_exist(name)) {
        printf("\nERROR input file %s does not exist", name);
        exit(1);
    }
    get_image_size(name, &in_length, &in_width);

    /* The flattening loop reads the input with output-sized indices, and the
       kernels bound rows by `width`, so the output must be square and fit
       inside the input image. */
    if (out_length < 1 || out_width < 1 ||
        out_length > in_length || out_width > in_width) {
        printf("\nERROR output size %dx%d must be positive and no larger"
               " than the input size %dx%d",
               out_length, out_width, in_length, in_width);
        exit(1);
    }
    if (out_length != out_width) {
        printf("\nERROR the kernels only support square images (got %dx%d)",
               out_length, out_width);
        exit(1);
    }

    the_image = (short **)allocate_image_array(in_length, in_width);
    read_image_array(name, the_image);

    /******************************************
    *
    * Create the output image and allocate
    * the output image array.
    *
    ******************************************/
    if (is_a_bmp(name)) {
        read_bmp_file_header(name, &bmp_file_header);
        read_bm_header(name, &bmheader);
        bmheader.height = out_length;
        bmheader.width = out_width;
        create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
    }
    out_image = (short **)allocate_image_array(out_length, out_width);

    //-----------CREATING POINTERS AND ALLOCATING INTO DEVICE MEMORY-----------
    short *d_A = NULL; //Input image
    short *d_B = NULL; //Output image
    short *d_C = NULL; //Gaussian Mask
    short *d_D = NULL; //Sharpen Mask

    //Gauss mask
    short gauss[25] = { 1,  4,  6,  4, 1,
                        4, 16, 24, 16, 4,
                        6, 24, 36, 24, 6,
                        4, 16, 24, 16, 4,
                        1,  4,  6,  4, 1 };
    //Sharpen mask
    short sharpen[25] = { -1, -1, -1, -1, -1,
                          -1,  2,  2,  2, -1,
                          -1,  2,  8,  2, -1,
                          -1,  2,  2,  2, -1,
                          -1, -1, -1, -1, -1 };

    //Host-side flat buffers (size_t avoids int overflow for large images)
    const size_t image_bytes = (size_t)out_length * (size_t)out_width * sizeof(short);
    const size_t mask_bytes = 25 * sizeof(short);
    short *out_image2 = (short *)malloc(image_bytes);
    short *the_image2 = (short *)malloc(image_bytes);
    if (out_image2 == NULL || the_image2 == NULL) {
        printf("\nERROR could not allocate host image buffers");
        exit(1);
    }

    //Converting image into a 1D array (rows bounded by out_length, columns by out_width)
    for (m = 0; m < out_length; m++){
        for (n = 0; n < out_width; n++){
            the_image2[(size_t)m * out_width + n] = the_image[m][n];
        }
    }

    //Allocating memory in device
    check_cuda(cudaMalloc((void**)&d_D, mask_bytes), "cudaMalloc(d_D)");
    check_cuda(cudaMalloc((void**)&d_C, mask_bytes), "cudaMalloc(d_C)");
    check_cuda(cudaMalloc((void**)&d_A, image_bytes), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc((void**)&d_B, image_bytes), "cudaMalloc(d_B)");

    //Copying to device
    check_cuda(cudaMemcpy(d_C, gauss, mask_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(gauss)");
    check_cuda(cudaMemcpy(d_D, sharpen, mask_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(sharpen)");
    check_cuda(cudaMemcpy(d_A, the_image2, image_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(image)");
    //Border pixels are never written by the kernels; zero them so the output is deterministic
    check_cuda(cudaMemset(d_B, 0, image_bytes), "cudaMemset(d_B)");

    //-----------------------DEFINING DIMENSIONS-----------------------
    //Ceil-div so the grid covers any image size:
    //128, 128 (Apollo) - 16, 16 (Lena)
    dim3 dimBlock(32, 32, 1); //32, 32 (Apollo) - 32, 32 (Lena)
    dim3 dimGrid((out_width + dimBlock.x - 1) / dimBlock.x,
                 (out_length + dimBlock.y - 1) / dimBlock.y, 1);

    //-----------------------APPLY FILTER -----------------------
    //Select the filter by (un)commenting one launch; both write to d_B.
    //GaussConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, d_C, 25, out_width);
    SharpenConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, d_D, 25, out_width);
    check_cuda(cudaGetLastError(), "kernel launch");

    //--------------------RETURN IMAGE TO HOST------------------
    //The blocking copy also reports any error raised while the kernel ran
    check_cuda(cudaMemcpy(out_image2, d_B, image_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    //Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_D);

    //--------------------CONVERTING TO 2D---------------------
    for (m = 0; m < out_length; m++){
        for (n = 0; n < out_width; n++){
            out_image[m][n] = out_image2[(size_t)m * out_width + n];
        }
    }

    //-------------------WRITE INTO FILE----------------------
    write_image_array(name2, out_image);

    free(out_image2);
    free(the_image2);
    free_image_array(out_image, out_length);
    free_image_array(the_image, in_length);
    cudaProfilerStop();
    return 0;
}
/* ends main */
Tiled Only
***********************************************/
#include <cuda.h>
#include <stdio.h>
#define TILESIZE 32
extern "C"{
#include "cips.h"
}
__global__ void GaussConvolution(short *N, short *P, short *gaussmask, int mask_elements, int
width){
__shared__ short N_shared[TILESIZE][TILESIZE];
int row = TILESIZE * blockIdx.y + threadIdx.y;
int col = TILESIZE * blockIdx.x + threadIdx.x;
N_shared[threadIdx.y][threadIdx.x] = N[row * width + col];
__syncthreads();
long acc = 0;
int i;
int j;
//NO HALO ELEMENTS
if (threadIdx.x > 2 && threadIdx.y > 2 && threadIdx.x < (TILESIZE - 3) &&
threadIdx.y < (TILESIZE - 3)){
for (i = 0; i < 5; i++){
int Nrow = threadIdx.y - 2 + i;
//This index will probably be
stored in a register, and doing a single loop computing 5 elements by iteration recudes the
number of times this index is being computed
acc = acc + gaussmask[i * 5 + 0] * N_shared[Nrow][threadIdx.x - 2];
acc = acc + gaussmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x];
acc = acc + gaussmask[i * 5 + 3] * N_shared[Nrow][threadIdx.x + 1];
}
}
//HALO ELEMENTS
else {
for (i = 0; i < 5; i++){
int Nrow = row - 2 + i;
acc
acc
acc
acc
acc
}
=
=
=
=
=
acc
acc
acc
acc
acc
+
+
+
+
+
gaussmask[i
gaussmask[i
gaussmask[i
gaussmask[i
gaussmask[i
*
*
*
*
*
5
5
5
5
5
+
+
+
+
+
0]
1]
1]
3]
4]
*
*
*
*
*
N[(Nrow)
N[(Nrow)
N[(Nrow)
N[(Nrow)
N[(Nrow)
*
*
*
*
*
width
width
width
width
width
+
+
+
+
+
col col col];

col +
col +
2];
1];
1];
2];
}
//SATURATION
if (P[row * width + col] < 0)
}
else{
}
}
// Sharpen convolution kernel for the "tiled only" version: interior threads
// read their neighbourhood from the shared tile N_shared, halo threads read it
// from global memory N, and the result is stored as acc / 2. The final `else`
// copies the input pixel to the output unchanged.
// NOTE(review): this listing is fragmentary. The declarations of N_shared,
// row, col and Nrow, the tile load, the `if` conditions matching both `else`
// branches and the saturation code are not present here, and several
// expressions below are cut short. Restore them from the original source
// before compiling.
__global__ void SharpenConvolution(short *N, short *P, short *sharpenmask, int mask_elements,
int width){
__syncthreads();
//printf("thead Synch");
long acc = 0;
int i;
int j;
//NO HALO ELEMENTS
for (i = 0; i < 5; i++){
// NOTE(review): only mask columns 0, 1 and 3 appear in the lines that
// survived; the remaining terms are truncated.
acc = acc + sharpenmask[i * 5 + 0] * N_shared[Nrow][threadIdx.x 2];
acc = acc + sharpenmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x];
acc = acc + sharpenmask[i * 5 + 3] * N_shared[Nrow][threadIdx.x +
1];
2];
}
}
//HALO ELEMENTS
else {
for (i = 0; i < 5; i++){
// NOTE(review): mask column index 1 is applied to both `col - 1` and `col`,
// and index 2 (the centre column) is never read. The `col` term likely
// should use sharpenmask[i * 5 + 2] - confirm.
acc = acc + sharpenmask[i * 5 + 0] * N[(Nrow)* width + col - 2];
acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow)* width + col - 1];
acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow)* width + col];
acc = acc + sharpenmask[i * 5 + 3] * N[(Nrow)* width + col + 1];

acc = acc + sharpenmask[i * 5 + 4] * N[(Nrow)* width + col + 2];
}
}
P[row * width + col] = (short)(acc / 2);
//SATURATION
}
else{
// Pixels that are not filtered are copied through unchanged.
P[row * width + col] = N[row * width + col];
}
}
{
char response[80];
int sat;
/******************************************
*
*
******************************************/
if (argc < 5 || (argc > 5 && argc < 7)) {
"\n"
exit(0);
}
if (argc > 5) {
il = atoi(argv[5]);
ie = atoi(argv[6]);
}
exit(0);
}
/******************************************
*
*
******************************************/
}

//Input image
short *d_C; //Gaussian Mask
short *d_D; //Sharpen Mask
//Gauss mask
short gauss[25] = { 1, 4, 6, 4, 1, 4, 16, 24, 16, 4, 6, 24, 36, 24, 6, 4, 16, 24, 16, 4,
1, 4, 6, 4, 1 };
short sharpen[25] = { -1, -1, -1, -1, -1,
-1, 2, 2, 2, -1,
-1, 2, 8, 2, -1,
-1,
2, 2, 2, -1,
-1, -1, -1, -1, -1};
short *out_image2;
short *the_image2;
int m;
int n;
}
}
cudaMalloc((void**)&d_D, 25*sizeof(short));
cudaMalloc((void**)&d_C, 25*sizeof(short));
//Copying to device
cudaMemcpy(d_C, gauss, 25 * sizeof(short), cudaMemcpyHostToDevice);
cudaMemcpy(d_D, sharpen, 25 * sizeof(short), cudaMemcpyHostToDevice);
//128, 128 (Apollo) - 64, 64 (Lena)
//-----------------------APPLY FILTER ----------------------GaussConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, d_C, 25, out_width);
SharpenConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, d_D, 25, out_width);

//Free memory
cudaFree(d_A);
cudaFree(d_B);
int L;
}
}
cudaProfilerStop();
return 0;
}
/* ends main */
Constant Memory only

/***********************************************/
#include
#include
#include
#include
#include
<cuda.h>
<cuda_runtime.h>
"device_launch_parameters.h"
<stdio.h>
<cuda_profiler_api.h>
#define TILESIZE 32
#define MASK_WIDTH 25
__constant__ short gaussmask[MASK_WIDTH];
__constant__ short sharpenmask[MASK_WIDTH];
extern "C"{
#include "cips.h"
}
__global__ void GaussConvolution(short *N, short *P, int mask_elements, int width){
long acc = 0;
int i;
for (i = 0; i < 5; i++){
acc = acc + gaussmask[i * 5 + 0] * N[(Nrow) * width + col acc = acc + gaussmask[i * 5 + 1] * N[(Nrow) * width + col acc = acc + gaussmask[i * 5 + 1] * N[(Nrow) * width + col];
acc = acc + gaussmask[i * 5 + 3] * N[(Nrow) * width + col +
acc = acc + gaussmask[i * 5 + 4] * N[(Nrow) * width + col +
}
//SATURATION
}
else{
}
}
__global__ void SharpenConvolution(short *N, short *P, int mask_elements, int width){
2];
1];
1];
2];

long acc = 0;
int i;
if ((row > 2) && (col > 2) && (row < (width - 3)) &&
for (i = 0; i < 5; i++){
acc = acc + sharpenmask[i * 5 +
}
//SATURATION
}
else{
}
(col < (width - 3))){
0]
1]
1]
3]
4]
*
*
*
*
*
N[(Nrow)
N[(Nrow)
N[(Nrow)
N[(Nrow)
N[(Nrow)
*
*
*
*
*
width
width
width
width
width
}
{
char response[80];
int sat;
/******************************************
*
*
******************************************/
if (argc < 5 || (argc > 5 && argc < 7)) {
"\n"
exit(0);
}
if (argc > 5) {
il = atoi(argv[5]);
ie = atoi(argv[6]);
}
+
+
+
+
+
col col col];

col +
col +
2];
1];
1];
2];
exit(0);
}
/******************************************
*
*
******************************************/
}

//Input image
//Gauss mask
short gauss[25] = { 1, 4, 6, 4, 1, 4, 16, 24, 16, 4, 6, 24, 36, 24, 6, 4, 16, 24, 16, 4,
1, 4, 6, 4, 1 };
short sharpen[25] = { -1, -1, -1, -1, -1,
-1, 2, 2, 2, -1,
-1, 2, 8, 2, -1,
-1,
2, 2, 2, -1,
-1, -1, -1, -1, -1};
short *out_image2;
short *the_image2;
int m;
int n;
}
}
//cudaMalloc((void**)&d_C, 25*sizeof(short));
//Copying to device
//cudaMemcpy(d_C, gauss, 25 * sizeof(short), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(gaussmask, gauss, 25 * sizeof(short));
cudaMemcpyToSymbol(sharpenmask, sharpen, 25 * sizeof(short));
//-----------------------DEFINING DIMENSIONS-----------------------
dim3 dimGrid(16, 16, 1);

dim3 dimBlock(32, 32, 1);
//128, 128 (Apollo) - 64, 64 (Lena)

//32, 32 (Apollo) - 8, 8 (Lena)
//-----------------------APPLY FILTER ----------------------SharpenConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);

//GaussConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);
//Free memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(gaussmask);
int L;
}
}
cudaProfilerStop();
return 0;
}
/* ends main */
Tiling + Constant Memory

/***********************************************/
#include <cuda.h>
#include <stdio.h>
#define TILESIZE 32
#define MASK_WIDTH 25
__constant__ short gaussmask[MASK_WIDTH];
__constant__ short sharpenmask[MASK_WIDTH];
extern "C"{
#include "cips.h"
}
__global__ void GaussConvolution(short *N, short *P, int mask_elements, int width){
__syncthreads();
long acc = 0;
int i;
int j;
//NO HALO ELEMENTS
for (i = 0; i < 5; i++){
acc = acc + gaussmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x];
}
}
//HALO ELEMENTS
else {
//for (i =
for
int
acc
acc
acc
acc
acc
}
0; i < mask_elements;
(i = 0; i < 5; i++){
Nrow = row - 2 + i;
= acc + gaussmask[i *
i++){
5
5
5
5
5
+
+
+
+
+
0]
1]
1]
3]
4]
*
*
*
*
*
N[(Nrow)
N[(Nrow)
N[(Nrow)
N[(Nrow)
N[(Nrow)
*
*
*
*
*
width
width
width
width
width
+
+
+
+
+
col col col];

col +
col +
2];
1];
1];
2];
}
//SATURATION
}
else{
}
}
// Sharpen convolution kernel for the "tiling + constant memory" version: the
// mask is read from the __constant__ array sharpenmask, interior threads read
// the image from the shared tile N_shared, and halo threads read it from
// global memory N.
// NOTE(review): this listing is fragmentary. The declarations of N_shared,
// row, col and Nrow, the tile load, the `if` conditions matching both `else`
// branches, the store to P and the saturation code are not present here.
// Restore them from the original source before compiling.
__global__ void SharpenConvolution(short *N, short *P, int mask_elements, int width){
__syncthreads();
long acc = 0;
int i;
int j;
//NO HALO ELEMENTS
for (i = 0; i < 5; i++){
// NOTE(review): only one of the five per-row terms survived in this listing.
acc = acc + sharpenmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x];
1];
2];
}
}
//HALO ELEMENTS
else {
for (i = 0; i < 5; i++){
// NOTE(review): mask column index 1 is applied to both `col - 1` and `col`,
// and index 2 (the centre column) is never read. The `col` term likely
// should use sharpenmask[i * 5 + 2] - confirm.
acc = acc + sharpenmask[i * 5 + 0] * N[(Nrow) * width + col - 2];
acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow) * width + col - 1];
acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow) * width + col];

acc = acc + sharpenmask[i * 5 + 3] * N[(Nrow) * width + col + 1];
acc = acc + sharpenmask[i * 5 + 4] * N[(Nrow) * width + col + 2];
}
}
//SATURATION
}
else{
}
}
{
char response[80];
int sat;
/******************************************
*
*
******************************************/
if (argc < 5 || (argc > 5 && argc < 7)) {
"\n"
exit(0);
}
if (argc > 5) {
il = atoi(argv[5]);
ie = atoi(argv[6]);
}
exit(0);
}
/******************************************
*
*
******************************************/
}

//Input image
//Gauss mask
short gauss[25] = { 1, 4, 6, 4, 1, 4, 16, 24, 16, 4, 6, 24, 36, 24, 6, 4, 16, 24, 16, 4,
1, 4, 6, 4, 1 };
short sharpen[25] = { -1, -1, -1, -1, -1,
-1, 2, 2, 2, -1,
-1, 2, 8, 2, -1,
-1,
2, 2, 2, -1,
-1, -1, -1, -1, -1};
short *out_image2;
short *the_image2;
int m;
int n;
}
}
//cudaMalloc((void**)&d_C, 25*sizeof(short));
//Copying to device
//cudaMemcpy(d_C, gauss, 25 * sizeof(short), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(gaussmask, gauss, 25 * sizeof(short));
cudaMemcpyToSymbol(sharpenmask, sharpen, 25 * sizeof(short));
//128, 128 (Apollo) - 16, 16 (Lena)
//-----------------------APPLY FILTER ----------------------SharpenConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);
GaussConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);
//--------------------RETURN IMAGE TO HOST------------------
cudaMemcpy(out_image2, d_B, sizeoutput, cudaMemcpyDeviceToHost);

//Free memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(gaussmask);
int L;
}
}
cudaProfilerStop();
return 0;
}
/* ends main */

CUDA Comparison

Загружено:

Сведения о документе

Оригинальное название

Авторское право

Доступные форматы

Поделиться этим документом

Поделиться или встроить документ

Параметры публикации

Этот документ был вам полезен?

Это неприемлемый материал?

Авторское право:

Доступные форматы

CUDA Comparison

Загружено:

Авторское право:

Доступные форматы

Helder Eiki Oshiro

It was made 5 versions of the program:

The sequential, processed by CPU

Tiling + cnst memory

Const memory only

Figure 1 Timing for each version, measured in microseconds

The results were plotted in the graph below.

//-----------CREATING POINTERS AND ALLOCATING INTO DEVICE MEMORY----------------short *d_A;

col col col];

acc = acc + sharpenmask[i * 5 + 3] * N[(Nrow)* width + col + 1];

//-----------CREATING POINTERS AND ALLOCATING INTO DEVICE MEMORY----------------short *d_A;

//--------------------RETURN IMAGE TO HOST-----------------cudaMemcpy(out_image2, d_B, sizeoutput, cudaMemcpyDeviceToHost);

Constant Memory only

int row = TILESIZE * blockIdx.y + threadIdx.y;

(col < (width - 3))){

col col col];

//-----------CREATING POINTERS AND ALLOCATING INTO DEVICE MEMORY----------------short *d_A;

dim3 dimGrid(16, 16, 1);

//128, 128 (Apollo) - 64, 64 (Lena)

//-----------------------APPLY FILTER ----------------------SharpenConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);

Tiling + Constant Memory

col col col];

acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow) * width + col];

//-----------CREATING POINTERS AND ALLOCATING INTO DEVICE MEMORY----------------short *d_A;

cudaMemcpy(out_image2, d_B, sizeoutput, cudaMemcpyDeviceToHost);

Вам также может понравиться