Вы находитесь на странице: 1из 23

Helder Eiki Oshiro

80568704

MIDTERM 1

Five versions of the program were written:

The sequential, processed by CPU


Simple kernel, using global memory and processed in GPU
Using constant memory to store the mask and global memory to store the image
Using shared memory to store the image
Using tiles (shared memory) and constant memory

The computation times are shown in the table below. Processing time was measured in microseconds and covers
only the convolution itself, not the time spent reading or saving files. Kernel execution time was measured with
the nvprof tool, and the sequential code was timed with gettimeofday(). In both cases the computer used was
equipped with an Intel Core i5-6300HQ CPU and an NVIDIA GTX 960M GPU.
| Version | Filter | Apollo | Lena |
|---|---|---|---|
| CPU | Sharpen | 2,660,925 | 39,252 |
| CPU | Gaussian | 2,558,420 | 42,079 |
| Tiled only | Sharpen | 20,861 | 385 |
| Tiled only | Gaussian | 20,860 | 384 |
| Simple Kernel | Sharpen | 18,465 | 294 |
| Simple Kernel | Gaussian | 18,462 | 291 |
| Tiling + const memory | Sharpen | 10,550 | 166 |
| Tiling + const memory | Gaussian | 10,578 | 164 |
| Const memory only | Sharpen | 6,533 | 102 |
| Const memory only | Gaussian | 6,526 | 100 |

Figure 1 Timing for each version, measured in microseconds

The results were plotted in the graph below.

APOLLO.BMP (MICROSECONDS)
Gaussian

TILED ONLY

SIMPLE

TILED + CONST
MEMORY

6,526

6,533

10,578

10,550

18,462

18,465

20,860

20,861

Sharpen

CONSTANT MEMORY
ONLY

LENA.BMP (MICROSECONDS)
Gaussian

TILED ONLY

SIMPLE

TILED + CONST
MEMORY

100

102

164

166

291

294

384

385

Sharpen

CONSTANT MEMORY
ONLY

As we can see, the version using only constant memory is the fastest, while the kernel using only shared
memory (tiling) is the slowest.

Analyzing this behavior, we can presume that transferring the image from global memory to shared memory is
not worth the effort, because of the overhead it incurs. Each output element requires 25 (the number of
elements in the mask) accesses to the image matrix and 25 accesses to the mask matrix. By comparison, in the
matrix multiplication kernel, whose performance improves greatly with the tiling technique, each output
element requires number_of_columns + number_of_rows memory accesses.
Another possible explanation for the constant memory version being the fastest is that, since each thread
shares many elements with its neighbors, some elements may already be in cache memory, which compensates
for not using shared memory. Furthermore, no time is spent loading data into shared memory.

SEQUENTIAL CODE
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include "cips.h"
/*
 * Sequential (CPU) version of the 5x5 image convolution.
 *
 * Usage: roundoff in-image out-image length width [il ie]
 *   argv[1]           input image file
 *   argv[2]           output image file
 *   argv[3]           number of rows to process (length)
 *   argv[4]           number of columns to process (width)
 *   argv[5], argv[6]  accepted so existing command lines keep working; unused
 *
 * Prints the time spent in the convolution loop, in microseconds, and then
 * writes the filtered image. Returns 0 on success; exits with status 1 on a
 * bad command line, a missing input file or an invalid size.
 */
int main (int argc, char *argv[])
{
    /* Gaussian mask; the weights sum to 256, which is the divisor used below. */
    static const int mask[5][5] = {
        { 1,  4,  6,  4, 1 },
        { 4, 16, 24, 16, 4 },
        { 6, 24, 36, 24, 6 },
        { 4, 16, 24, 16, 4 },
        { 1,  4,  6,  4, 1 }
    };
    /*
     * Alternative sharpen mask (swap it in by hand to time the sharpen filter):
     *   { -1, -1, -1, -1, -1 },
     *   { -1,  2,  2,  2, -1 },
     *   { -1,  2,  8,  2, -1 },
     *   { -1,  2,  2,  2, -1 },
     *   { -1, -1, -1, -1, -1 }
     */
    const int divisor = 256;

    char name[80], name2[80];
    int in_length = 0, in_width = 0;
    int out_length, out_width;
    int i, j, m, n;
    short **the_image, **out_image;
    struct bmpfileheader bmp_file_header;
    struct bitmapheader bmheader;
    struct timeval start, end;
    long elapsed_us;

    /* Ensure the command line is correct. */
    if (argc < 5 || (argc > 5 && argc < 7)) {
        printf ("\nusage: roundoff in-image out-image"
                " length width [il ie]"
                "\n"
                "\n If you do not specify il ie"
                " they will be set to 1 1."
                "\n ll le will always be" " il+length and ie+width" "\n");
        exit (1);
    }

    /* Bounded copies: a long path must not overflow the 80-byte buffers. */
    snprintf (name, sizeof (name), "%s", argv[1]);
    snprintf (name2, sizeof (name2), "%s", argv[2]);
    out_length = atoi (argv[3]);
    out_width = atoi (argv[4]);

    if (does_not_exist (name)) {
        printf ("\nERROR input file %s does not exist", name);
        exit (1);
    }
    get_image_size (name, &in_length, &in_width);

    /* The loop below reads the input up to the requested size, so it must fit. */
    if (out_length <= 0 || out_width <= 0 ||
        out_length > in_length || out_width > in_width) {
        printf ("\nERROR length and width must be positive and no larger"
                " than the %d x %d input image\n", in_length, in_width);
        exit (1);
    }

    the_image = (short **) allocate_image_array (in_length, in_width);
    read_image_array (name, the_image);

    /* Create the output file (BMP input only) and allocate the output array. */
    if (is_a_bmp (name)) {
        read_bmp_file_header (name, &bmp_file_header);
        read_bm_header (name, &bmheader);
        bmheader.height = out_length;
        bmheader.width = out_width;
        create_allocate_bmp_file (name2, &bmp_file_header, &bmheader);
    }
    out_image = (short **) allocate_image_array (out_length, out_width);

    /*
     * Convolve every pixel that has a complete 5x5 neighbourhood, i.e. rows
     * and columns 2 .. size-3. Only this loop is timed.
     */
    gettimeofday (&start, NULL);
    for (i = 2; i < out_length - 2; i++) {
        for (j = 2; j < out_width - 2; j++) {
            int acc = 0;
            for (m = 0; m < 5; m++) {
                for (n = 0; n < 5; n++) {
                    acc += mask[m][n] * the_image[i - 2 + m][j - 2 + n];
                }
            }
            acc /= divisor;
            /* Saturate to the 0..255 range. */
            if (acc > 255) acc = 255;
            if (acc < 0) acc = 0;
            out_image[i][j] = (short) acc;
        }
    }
    gettimeofday (&end, NULL);

    elapsed_us = (long) (end.tv_sec - start.tv_sec) * 1000000L
               + (long) (end.tv_usec - start.tv_usec);
    printf ("%ld\n", elapsed_us);

    write_image_array (name2, out_image);
    free_image_array (out_image, out_length);
    free_image_array (the_image, in_length);
    return 0;
}
/* ends main */

SIMPLE KERNEL
/***********************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);

/*
 * 5x5 Gaussian convolution, one thread per output pixel, reading the image
 * and the mask directly through the device pointers it is given.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   Mask           mask weights, row-major, 5 weights per row
 *   mask_elements  number of weights to apply (25 for the full 5x5 mask)
 *   width          image side length in pixels
 *
 * Launch with a 2D grid of 2D blocks covering at least width x width threads;
 * threads that fall outside the image do nothing. Pixels without a complete
 * 5x5 neighbourhood (closer than 2 pixels to an edge) are set to 100. The
 * weighted sum is divided by 256 and saturated to 0..255.
 */
__global__ void GaussConvolution(short *N, short *P, short *Mask, int mask_elements, int width){
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid may be larger than the image: stray threads must not touch memory.
    if (row >= width || col >= width)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        // int accumulator: the weighted sum can reach 256 * 255, which does
        // not fit in a short.
        int acc = 0;
        for (int i = 0; i < mask_elements; i++){
            acc += Mask[i] * N[(row - 2 + i / 5) * width + (col - 2 + i % 5)];
        }
        acc /= 256;
        // Saturate in a register, then store once.
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = 100;
    }
}
/*
 * 5x5 sharpen convolution, one thread per output pixel, reading the image
 * and the mask directly through the device pointers it is given.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   Mask           mask weights, row-major, 5 weights per row
 *   mask_elements  number of weights to apply (25 for the full 5x5 mask)
 *   width          image side length in pixels
 *
 * Launch with a 2D grid of 2D blocks covering at least width x width threads;
 * threads that fall outside the image do nothing. Pixels without a complete
 * 5x5 neighbourhood (closer than 2 pixels to an edge) are set to 100. The
 * weighted sum is divided by 2 and saturated to 0..255.
 */
__global__ void SharpenConvolution(short *N, short *P, short *Mask, int mask_elements, int width){
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid may be larger than the image: stray threads must not touch memory.
    if (row >= width || col >= width)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        int acc = 0;
        for (int i = 0; i < mask_elements; i++){
            acc += Mask[i] * N[(row - 2 + i / 5) * width + (col - 2 + i % 5)];
        }
        acc /= 2;
        // Negative weights can push the sum below zero, so clamp both ends.
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = 100;
    }
}
__constant__ short sharpen_ptr[25];
//__constant__ short gauss_ptr;
#ifndef CUDA_CHECK
/* Print the error text of a failed CUDA runtime call and terminate. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(1);                                                      \
        }                                                                 \
    } while (0)
#endif

/*
 * Host driver for the "simple kernel" version: image and masks are copied to
 * device buffers and passed to the kernels as pointers.
 *
 * Usage: roundoff in-image out-image length width [il ie]
 *   argv[3] / argv[4]  rows / columns to process; they must be equal because
 *                      the kernels take a single width and assume a square
 *                      image. argv[5] and argv[6] are accepted but unused.
 *
 * Both kernels are launched (so that a profiler records each of them) and
 * both write to the same output buffer, so the file written at the end holds
 * the result of the second launch (sharpen).
 */
int main(int argc, char *argv[])
{
    char name[80], name2[80];
    int in_length = 0, in_width = 0;
    short **the_image, **out_image;
    struct bmpfileheader bmp_file_header;
    struct bitmapheader bmheader;

    /* Ensure the command line is correct. */
    if (argc < 5 || (argc > 5 && argc < 7)) {
        printf("\nusage: roundoff in-image out-image"
               " length width [il ie]"
               "\n"
               "\n If you do not specify il ie"
               " they will be set to 1 1."
               "\n ll le will always be" " il+length and ie+width" "\n");
        exit(1);
    }

    /* Bounded copies: a long path must not overflow the 80-byte buffers. */
    snprintf(name, sizeof(name), "%s", argv[1]);
    snprintf(name2, sizeof(name2), "%s", argv[2]);
    const int out_length = atoi(argv[3]);
    const int out_width = atoi(argv[4]);

    if (does_not_exist(name)) {
        printf("\nERROR input file %s does not exist", name);
        exit(1);
    }
    get_image_size(name, &in_length, &in_width);
    if (out_length <= 0 || out_length != out_width ||
        out_length > in_length || out_width > in_width) {
        printf("\nERROR length and width must be equal, positive and no"
               " larger than the %d x %d input image\n", in_length, in_width);
        exit(1);
    }
    the_image = (short **)allocate_image_array(in_length, in_width);
    read_image_array(name, the_image);

    /* Create the output file (BMP input only) and allocate the output array. */
    if (is_a_bmp(name)) {
        read_bmp_file_header(name, &bmp_file_header);
        read_bm_header(name, &bmheader);
        bmheader.height = out_length;
        bmheader.width = out_width;
        create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
    }
    out_image = (short **)allocate_image_array(out_length, out_width);

    short gauss[25] = { 1, 4, 6, 4, 1,
                        4, 16, 24, 16, 4,
                        6, 24, 36, 24, 6,
                        4, 16, 24, 16, 4,
                        1, 4, 6, 4, 1 };
    short sharpen[25] = { -1, -1, -1, -1, -1,
                          -1, 2, 2, 2, -1,
                          -1, 2, 8, 2, -1,
                          -1, 2, 2, 2, -1,
                          -1, -1, -1, -1, -1 };

    /* Flat (row-major) host copies of the input and output images. */
    const size_t image_bytes = (size_t)out_length * out_width * sizeof(short);
    short *the_image2 = (short *)malloc(image_bytes);
    short *out_image2 = (short *)malloc(image_bytes);
    if (the_image2 == NULL || out_image2 == NULL) {
        printf("\nERROR out of host memory\n");
        exit(1);
    }
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            the_image2[(size_t)r * out_width + c] = the_image[r][c];
        }
    }

    /* Device buffers: input image, output image, Gaussian mask, sharpen mask. */
    short *d_A = NULL, *d_B = NULL, *d_C = NULL, *d_D = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, image_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B, image_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_C, sizeof(gauss)));
    CUDA_CHECK(cudaMalloc((void **)&d_D, sizeof(sharpen)));
    CUDA_CHECK(cudaMemcpy(d_C, gauss, sizeof(gauss), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_D, sharpen, sizeof(sharpen), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_A, the_image2, image_bytes, cudaMemcpyHostToDevice));

    /* 32x32 threads per block; enough blocks to cover the image
       (16x16 blocks for a 512x512 image, 128x128 for 4096x4096). */
    const dim3 dimBlock(32, 32, 1);
    const dim3 dimGrid((out_width + 31) / 32, (out_length + 31) / 32, 1);

    GaussConvolution<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, 25, out_width);
    CUDA_CHECK(cudaGetLastError());
    SharpenConvolution<<<dimGrid, dimBlock>>>(d_A, d_B, d_D, 25, out_width);
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also waits for the kernels and reports their errors. */
    CUDA_CHECK(cudaMemcpy(out_image2, d_B, image_bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaFree(d_D));

    /* Back to the 2D layout expected by write_image_array. */
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            out_image[r][c] = out_image2[(size_t)r * out_width + c];
        }
    }
    write_image_array(name2, out_image);

    free(the_image2);
    free(out_image2);
    free_image_array(out_image, out_length);
    free_image_array(the_image, in_length);
    cudaProfilerStop();
    return 0;
}

/* ends main */

Tiled Only
/***********************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>
#define TILESIZE 32
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);

/*
 * 5x5 Gaussian convolution using a shared-memory tile of the image.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   gaussmask      25 mask weights, row-major
 *   mask_elements  unused; kept so existing launches keep compiling
 *   width          image side length in pixels
 *
 * Must be launched with TILESIZE x TILESIZE threads per block and a 2D grid
 * covering the image. Uses TILESIZE * TILESIZE shorts of static shared
 * memory. The tile has no halo: threads whose 5x5 neighbourhood lies inside
 * the tile read shared memory, the others read N directly. Pixels closer
 * than 2 to an image edge are set to 100. The weighted sum is divided by 256
 * and saturated to 0..255.
 */
__global__ void GaussConvolution(short *N, short *P, short *gaussmask, int mask_elements, int width){
    __shared__ short N_shared[TILESIZE][TILESIZE];
    (void)mask_elements;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = TILESIZE * blockIdx.y + ty;
    const int col = TILESIZE * blockIdx.x + tx;
    const bool in_image = (row < width) && (col < width);

    // Every thread reaches the barrier; threads outside the image store 0
    // instead of reading past the end of N.
    N_shared[ty][tx] = in_image ? N[row * width + col] : (short)0;
    __syncthreads();
    if (!in_image)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        int acc = 0;
        if (tx >= 2 && ty >= 2 && tx < TILESIZE - 2 && ty < TILESIZE - 2){
            // Whole neighbourhood is inside the tile.
            for (int i = 0; i < 5; i++){
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += gaussmask[i * 5 + j] * N_shared[ty - 2 + i][tx - 2 + j];
                }
            }
        }
        else {
            // Neighbourhood crosses the tile border: read global memory.
            for (int i = 0; i < 5; i++){
                const int base = (row - 2 + i) * width + col - 2;
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += gaussmask[i * 5 + j] * N[base + j];
                }
            }
        }
        acc /= 256;
        // Saturation
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = 100;
    }
}
/*
 * 5x5 sharpen convolution using a shared-memory tile of the image.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   sharpenmask    25 mask weights, row-major
 *   mask_elements  unused; kept so existing launches keep compiling
 *   width          image side length in pixels
 *
 * Must be launched with TILESIZE x TILESIZE threads per block and a 2D grid
 * covering the image. Uses TILESIZE * TILESIZE shorts of static shared
 * memory. The tile has no halo: threads whose 5x5 neighbourhood lies inside
 * the tile read shared memory, the others read N directly. Pixels closer
 * than 2 to an image edge are copied unchanged from N. The weighted sum is
 * divided by 2 and saturated to 0..255.
 */
__global__ void SharpenConvolution(short *N, short *P, short *sharpenmask, int mask_elements, int width){
    __shared__ short N_shared[TILESIZE][TILESIZE];
    (void)mask_elements;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = TILESIZE * blockIdx.y + ty;
    const int col = TILESIZE * blockIdx.x + tx;
    const bool in_image = (row < width) && (col < width);

    // Every thread reaches the barrier; threads outside the image store 0
    // instead of reading past the end of N.
    N_shared[ty][tx] = in_image ? N[row * width + col] : (short)0;
    __syncthreads();
    if (!in_image)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        int acc = 0;
        if (tx >= 2 && ty >= 2 && tx < TILESIZE - 2 && ty < TILESIZE - 2){
            // Whole neighbourhood is inside the tile.
            for (int i = 0; i < 5; i++){
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += sharpenmask[i * 5 + j] * N_shared[ty - 2 + i][tx - 2 + j];
                }
            }
        }
        else {
            // Neighbourhood crosses the tile border: read global memory.
            for (int i = 0; i < 5; i++){
                const int base = (row - 2 + i) * width + col - 2;
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += sharpenmask[i * 5 + j] * N[base + j];
                }
            }
        }
        acc /= 2;
        // Saturation
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = N[out];
    }
}
#ifndef CUDA_CHECK
/* Print the error text of a failed CUDA runtime call and terminate. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(1);                                                      \
        }                                                                 \
    } while (0)
#endif

/*
 * Host driver for the "tiled only" version: the kernels stage the image in
 * shared memory and receive the masks as device pointers.
 *
 * Usage: roundoff in-image out-image length width [il ie]
 *   argv[3] / argv[4]  rows / columns to process; they must be equal because
 *                      the kernels take a single width and assume a square
 *                      image. argv[5] and argv[6] are accepted but unused.
 *
 * Both kernels are launched (so that a profiler records each of them) and
 * both write to the same output buffer, so the file written at the end holds
 * the result of the second launch (sharpen).
 */
int main(int argc, char *argv[])
{
    char name[80], name2[80];
    int in_length = 0, in_width = 0;
    short **the_image, **out_image;
    struct bmpfileheader bmp_file_header;
    struct bitmapheader bmheader;

    /* Ensure the command line is correct. */
    if (argc < 5 || (argc > 5 && argc < 7)) {
        printf("\nusage: roundoff in-image out-image"
               " length width [il ie]"
               "\n"
               "\n If you do not specify il ie"
               " they will be set to 1 1."
               "\n ll le will always be" " il+length and ie+width" "\n");
        exit(1);
    }

    /* Bounded copies: a long path must not overflow the 80-byte buffers. */
    snprintf(name, sizeof(name), "%s", argv[1]);
    snprintf(name2, sizeof(name2), "%s", argv[2]);
    const int out_length = atoi(argv[3]);
    const int out_width = atoi(argv[4]);

    if (does_not_exist(name)) {
        printf("\nERROR input file %s does not exist", name);
        exit(1);
    }
    get_image_size(name, &in_length, &in_width);
    if (out_length <= 0 || out_length != out_width ||
        out_length > in_length || out_width > in_width) {
        printf("\nERROR length and width must be equal, positive and no"
               " larger than the %d x %d input image\n", in_length, in_width);
        exit(1);
    }
    the_image = (short **)allocate_image_array(in_length, in_width);
    read_image_array(name, the_image);

    /* Create the output file (BMP input only) and allocate the output array. */
    if (is_a_bmp(name)) {
        read_bmp_file_header(name, &bmp_file_header);
        read_bm_header(name, &bmheader);
        bmheader.height = out_length;
        bmheader.width = out_width;
        create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
    }
    out_image = (short **)allocate_image_array(out_length, out_width);

    short gauss[25] = { 1, 4, 6, 4, 1,
                        4, 16, 24, 16, 4,
                        6, 24, 36, 24, 6,
                        4, 16, 24, 16, 4,
                        1, 4, 6, 4, 1 };
    short sharpen[25] = { -1, -1, -1, -1, -1,
                          -1, 2, 2, 2, -1,
                          -1, 2, 8, 2, -1,
                          -1, 2, 2, 2, -1,
                          -1, -1, -1, -1, -1 };

    /* Flat (row-major) host copies of the input and output images. */
    const size_t image_bytes = (size_t)out_length * out_width * sizeof(short);
    short *the_image2 = (short *)malloc(image_bytes);
    short *out_image2 = (short *)malloc(image_bytes);
    if (the_image2 == NULL || out_image2 == NULL) {
        printf("\nERROR out of host memory\n");
        exit(1);
    }
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            the_image2[(size_t)r * out_width + c] = the_image[r][c];
        }
    }

    /* Device buffers: input image, output image, Gaussian mask, sharpen mask. */
    short *d_A = NULL, *d_B = NULL, *d_C = NULL, *d_D = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, image_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B, image_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_C, sizeof(gauss)));
    CUDA_CHECK(cudaMalloc((void **)&d_D, sizeof(sharpen)));
    CUDA_CHECK(cudaMemcpy(d_C, gauss, sizeof(gauss), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_D, sharpen, sizeof(sharpen), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_A, the_image2, image_bytes, cudaMemcpyHostToDevice));

    /* The kernels index with TILESIZE, so the block must be TILESIZE x
       TILESIZE; the grid is sized to cover the whole image. */
    const dim3 dimBlock(TILESIZE, TILESIZE, 1);
    const dim3 dimGrid((out_width + TILESIZE - 1) / TILESIZE,
                       (out_length + TILESIZE - 1) / TILESIZE, 1);

    GaussConvolution<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, 25, out_width);
    CUDA_CHECK(cudaGetLastError());
    SharpenConvolution<<<dimGrid, dimBlock>>>(d_A, d_B, d_D, 25, out_width);
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also waits for the kernels and reports their errors. */
    CUDA_CHECK(cudaMemcpy(out_image2, d_B, image_bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaFree(d_D));

    /* Back to the 2D layout expected by write_image_array. */
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            out_image[r][c] = out_image2[(size_t)r * out_width + c];
        }
    }
    write_image_array(name2, out_image);

    free(the_image2);
    free(out_image2);
    free_image_array(out_image, out_length);
    free_image_array(the_image, in_length);
    cudaProfilerStop();
    return 0;
}

/* ends main */

Constant Memory only


/***********************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>

#define TILESIZE 32
#define MASK_WIDTH 25
__constant__ short gaussmask[MASK_WIDTH];
__constant__ short sharpenmask[MASK_WIDTH];
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);

/*
 * 5x5 Gaussian convolution that takes its weights from the __constant__
 * array gaussmask and reads the image directly through N.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   mask_elements  unused; kept so existing launches keep compiling
 *   width          image side length in pixels
 *
 * Launch with a 2D grid of 2D blocks covering at least width x width
 * threads; threads outside the image do nothing. gaussmask must have been
 * filled from the host before the launch. Pixels closer than 2 to an image
 * edge are copied unchanged from N. The weighted sum is divided by 256 and
 * saturated to 0..255.
 */
__global__ void GaussConvolution(short *N, short *P, int mask_elements, int width){
    (void)mask_elements;

    // blockDim gives the same indices as TILESIZE when the block is
    // TILESIZE x TILESIZE, and stays correct for other block sizes.
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= width || col >= width)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        int acc = 0;
        for (int i = 0; i < 5; i++){
            const int base = (row - 2 + i) * width + col - 2;
            #pragma unroll
            for (int j = 0; j < 5; j++){
                acc += gaussmask[i * 5 + j] * N[base + j];
            }
        }
        acc /= 256;
        // Saturation
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = N[out];
    }
}
/*
 * 5x5 sharpen convolution that takes its weights from the __constant__
 * array sharpenmask and reads the image directly through N.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   mask_elements  unused; kept so existing launches keep compiling
 *   width          image side length in pixels
 *
 * Launch with a 2D grid of 2D blocks covering at least width x width
 * threads; threads outside the image do nothing. sharpenmask must have been
 * filled from the host before the launch. Pixels closer than 2 to an image
 * edge are copied unchanged from N. The weighted sum is divided by 2 and
 * saturated to 0..255.
 */
__global__ void SharpenConvolution(short *N, short *P, int mask_elements, int width){
    (void)mask_elements;

    // blockDim gives the same indices as TILESIZE when the block is
    // TILESIZE x TILESIZE, and stays correct for other block sizes.
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= width || col >= width)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        int acc = 0;
        for (int i = 0; i < 5; i++){
            const int base = (row - 2 + i) * width + col - 2;
            #pragma unroll
            for (int j = 0; j < 5; j++){
                acc += sharpenmask[i * 5 + j] * N[base + j];
            }
        }
        acc /= 2;
        // Saturation
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = N[out];
    }
}
#ifndef CUDA_CHECK
/* Print the error text of a failed CUDA runtime call and terminate. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(1);                                                      \
        }                                                                 \
    } while (0)
#endif

/*
 * Host driver for the "constant memory only" version: the masks are copied
 * into the __constant__ arrays gaussmask / sharpenmask and the image stays
 * in ordinary device buffers.
 *
 * Usage: roundoff in-image out-image length width [il ie]
 *   argv[3] / argv[4]  rows / columns to process; they must be equal because
 *                      the kernels take a single width and assume a square
 *                      image. argv[5] and argv[6] are accepted but unused.
 *
 * Only the sharpen kernel is launched; the Gaussian launch is left commented
 * out, as in the original listing.
 */
int main(int argc, char *argv[])
{
    char name[80], name2[80];
    int in_length = 0, in_width = 0;
    short **the_image, **out_image;
    struct bmpfileheader bmp_file_header;
    struct bitmapheader bmheader;

    /* Ensure the command line is correct. */
    if (argc < 5 || (argc > 5 && argc < 7)) {
        printf("\nusage: roundoff in-image out-image"
               " length width [il ie]"
               "\n"
               "\n If you do not specify il ie"
               " they will be set to 1 1."
               "\n ll le will always be" " il+length and ie+width" "\n");
        exit(1);
    }

    /* Bounded copies: a long path must not overflow the 80-byte buffers. */
    snprintf(name, sizeof(name), "%s", argv[1]);
    snprintf(name2, sizeof(name2), "%s", argv[2]);
    const int out_length = atoi(argv[3]);
    const int out_width = atoi(argv[4]);

    if (does_not_exist(name)) {
        printf("\nERROR input file %s does not exist", name);
        exit(1);
    }
    get_image_size(name, &in_length, &in_width);
    if (out_length <= 0 || out_length != out_width ||
        out_length > in_length || out_width > in_width) {
        printf("\nERROR length and width must be equal, positive and no"
               " larger than the %d x %d input image\n", in_length, in_width);
        exit(1);
    }
    the_image = (short **)allocate_image_array(in_length, in_width);
    read_image_array(name, the_image);

    /* Create the output file (BMP input only) and allocate the output array. */
    if (is_a_bmp(name)) {
        read_bmp_file_header(name, &bmp_file_header);
        read_bm_header(name, &bmheader);
        bmheader.height = out_length;
        bmheader.width = out_width;
        create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
    }
    out_image = (short **)allocate_image_array(out_length, out_width);

    short gauss[25] = { 1, 4, 6, 4, 1,
                        4, 16, 24, 16, 4,
                        6, 24, 36, 24, 6,
                        4, 16, 24, 16, 4,
                        1, 4, 6, 4, 1 };
    short sharpen[25] = { -1, -1, -1, -1, -1,
                          -1, 2, 2, 2, -1,
                          -1, 2, 8, 2, -1,
                          -1, 2, 2, 2, -1,
                          -1, -1, -1, -1, -1 };

    /* Flat (row-major) host copies of the input and output images. */
    const size_t image_bytes = (size_t)out_length * out_width * sizeof(short);
    short *the_image2 = (short *)malloc(image_bytes);
    short *out_image2 = (short *)malloc(image_bytes);
    if (the_image2 == NULL || out_image2 == NULL) {
        printf("\nERROR out of host memory\n");
        exit(1);
    }
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            the_image2[(size_t)r * out_width + c] = the_image[r][c];
        }
    }

    /* Device buffers for the input and output images. */
    short *d_A = NULL, *d_B = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, image_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B, image_bytes));

    /* Masks go to constant memory; these symbols are not heap allocations
       and must never be passed to cudaFree. */
    CUDA_CHECK(cudaMemcpyToSymbol(gaussmask, gauss, sizeof(gauss)));
    CUDA_CHECK(cudaMemcpyToSymbol(sharpenmask, sharpen, sizeof(sharpen)));
    CUDA_CHECK(cudaMemcpy(d_A, the_image2, image_bytes, cudaMemcpyHostToDevice));

    /* TILESIZE x TILESIZE threads per block; the grid covers the image. */
    const dim3 dimBlock(TILESIZE, TILESIZE, 1);
    const dim3 dimGrid((out_width + TILESIZE - 1) / TILESIZE,
                       (out_length + TILESIZE - 1) / TILESIZE, 1);

    SharpenConvolution<<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);
    //GaussConvolution <<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also waits for the kernel and reports its errors. */
    CUDA_CHECK(cudaMemcpy(out_image2, d_B, image_bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));

    /* Back to the 2D layout expected by write_image_array. */
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            out_image[r][c] = out_image2[(size_t)r * out_width + c];
        }
    }
    write_image_array(name2, out_image);

    free(the_image2);
    free(out_image2);
    free_image_array(out_image, out_length);
    free_image_array(the_image, in_length);
    cudaProfilerStop();
    return 0;
}

/* ends main */

Tiling + Constant Memory


/***********************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>
#define TILESIZE 32
#define MASK_WIDTH 25
__constant__ short gaussmask[MASK_WIDTH];
__constant__ short sharpenmask[MASK_WIDTH];
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);

/*
 * 5x5 Gaussian convolution using a shared-memory tile of the image and the
 * __constant__ array gaussmask for the weights.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   mask_elements  unused; kept so existing launches keep compiling
 *   width          image side length in pixels
 *
 * Must be launched with TILESIZE x TILESIZE threads per block and a 2D grid
 * covering the image. Uses TILESIZE * TILESIZE shorts of static shared
 * memory. The tile has no halo: threads whose 5x5 neighbourhood lies inside
 * the tile read shared memory, the others read N directly. Pixels closer
 * than 2 to an image edge are set to 100. The weighted sum is divided by 256
 * and saturated to 0..255.
 */
__global__ void GaussConvolution(short *N, short *P, int mask_elements, int width){
    __shared__ short N_shared[TILESIZE][TILESIZE];
    (void)mask_elements;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = TILESIZE * blockIdx.y + ty;
    const int col = TILESIZE * blockIdx.x + tx;
    const bool in_image = (row < width) && (col < width);

    // Every thread reaches the barrier; threads outside the image store 0
    // instead of reading past the end of N.
    N_shared[ty][tx] = in_image ? N[row * width + col] : (short)0;
    __syncthreads();
    if (!in_image)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        int acc = 0;
        if (tx >= 2 && ty >= 2 && tx < TILESIZE - 2 && ty < TILESIZE - 2){
            // Whole neighbourhood is inside the tile.
            for (int i = 0; i < 5; i++){
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += gaussmask[i * 5 + j] * N_shared[ty - 2 + i][tx - 2 + j];
                }
            }
        }
        else {
            // Neighbourhood crosses the tile border: read global memory.
            for (int i = 0; i < 5; i++){
                const int base = (row - 2 + i) * width + col - 2;
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += gaussmask[i * 5 + j] * N[base + j];
                }
            }
        }
        acc /= 256;
        // Saturation
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = 100;
    }
}
/*
 * 5x5 sharpen convolution using a shared-memory tile of the image and the
 * __constant__ array sharpenmask for the weights.
 *
 *   N              input image, row-major; treated as width x width (square)
 *   P              output image, same layout as N
 *   mask_elements  unused; kept so existing launches keep compiling
 *   width          image side length in pixels
 *
 * Must be launched with TILESIZE x TILESIZE threads per block and a 2D grid
 * covering the image. Uses TILESIZE * TILESIZE shorts of static shared
 * memory. The tile has no halo: threads whose 5x5 neighbourhood lies inside
 * the tile read shared memory, the others read N directly. Pixels closer
 * than 2 to an image edge are set to 100. The weighted sum is divided by 2
 * and saturated to 0..255.
 */
__global__ void SharpenConvolution(short *N, short *P, int mask_elements, int width){
    __shared__ short N_shared[TILESIZE][TILESIZE];
    (void)mask_elements;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = TILESIZE * blockIdx.y + ty;
    const int col = TILESIZE * blockIdx.x + tx;
    const bool in_image = (row < width) && (col < width);

    // Every thread reaches the barrier; threads outside the image store 0
    // instead of reading past the end of N.
    N_shared[ty][tx] = in_image ? N[row * width + col] : (short)0;
    __syncthreads();
    if (!in_image)
        return;

    const int out = row * width + col;
    if (row >= 2 && col >= 2 && row < width - 2 && col < width - 2){
        int acc = 0;
        if (tx >= 2 && ty >= 2 && tx < TILESIZE - 2 && ty < TILESIZE - 2){
            // Whole neighbourhood is inside the tile.
            for (int i = 0; i < 5; i++){
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += sharpenmask[i * 5 + j] * N_shared[ty - 2 + i][tx - 2 + j];
                }
            }
        }
        else {
            // Neighbourhood crosses the tile border: read global memory.
            for (int i = 0; i < 5; i++){
                const int base = (row - 2 + i) * width + col - 2;
                #pragma unroll
                for (int j = 0; j < 5; j++){
                    acc += sharpenmask[i * 5 + j] * N[base + j];
                }
            }
        }
        acc /= 2;
        // Saturation
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[out] = (short)acc;
    }
    else{
        P[out] = 100;
    }
}
#ifndef CUDA_CHECK
/* Print the error text of a failed CUDA runtime call and terminate. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(1);                                                      \
        }                                                                 \
    } while (0)
#endif

/*
 * Host driver for the "tiling + constant memory" version: the masks are
 * copied into the __constant__ arrays gaussmask / sharpenmask and the
 * kernels stage the image in shared memory.
 *
 * Usage: roundoff in-image out-image length width [il ie]
 *   argv[3] / argv[4]  rows / columns to process; they must be equal because
 *                      the kernels take a single width and assume a square
 *                      image. argv[5] and argv[6] are accepted but unused.
 *
 * Both kernels are launched (so that a profiler records each of them) and
 * both write to the same output buffer, so the file written at the end holds
 * the result of the second launch (Gaussian).
 */
int main(int argc, char *argv[])
{
    char name[80], name2[80];
    int in_length = 0, in_width = 0;
    short **the_image, **out_image;
    struct bmpfileheader bmp_file_header;
    struct bitmapheader bmheader;

    /* Ensure the command line is correct. */
    if (argc < 5 || (argc > 5 && argc < 7)) {
        printf("\nusage: roundoff in-image out-image"
               " length width [il ie]"
               "\n"
               "\n If you do not specify il ie"
               " they will be set to 1 1."
               "\n ll le will always be" " il+length and ie+width" "\n");
        exit(1);
    }

    /* Bounded copies: a long path must not overflow the 80-byte buffers. */
    snprintf(name, sizeof(name), "%s", argv[1]);
    snprintf(name2, sizeof(name2), "%s", argv[2]);
    const int out_length = atoi(argv[3]);
    const int out_width = atoi(argv[4]);

    if (does_not_exist(name)) {
        printf("\nERROR input file %s does not exist", name);
        exit(1);
    }
    get_image_size(name, &in_length, &in_width);
    if (out_length <= 0 || out_length != out_width ||
        out_length > in_length || out_width > in_width) {
        printf("\nERROR length and width must be equal, positive and no"
               " larger than the %d x %d input image\n", in_length, in_width);
        exit(1);
    }
    the_image = (short **)allocate_image_array(in_length, in_width);
    read_image_array(name, the_image);

    /* Create the output file (BMP input only) and allocate the output array. */
    if (is_a_bmp(name)) {
        read_bmp_file_header(name, &bmp_file_header);
        read_bm_header(name, &bmheader);
        bmheader.height = out_length;
        bmheader.width = out_width;
        create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
    }
    out_image = (short **)allocate_image_array(out_length, out_width);

    short gauss[25] = { 1, 4, 6, 4, 1,
                        4, 16, 24, 16, 4,
                        6, 24, 36, 24, 6,
                        4, 16, 24, 16, 4,
                        1, 4, 6, 4, 1 };
    short sharpen[25] = { -1, -1, -1, -1, -1,
                          -1, 2, 2, 2, -1,
                          -1, 2, 8, 2, -1,
                          -1, 2, 2, 2, -1,
                          -1, -1, -1, -1, -1 };

    /* Flat (row-major) host copies of the input and output images. */
    const size_t image_bytes = (size_t)out_length * out_width * sizeof(short);
    short *the_image2 = (short *)malloc(image_bytes);
    short *out_image2 = (short *)malloc(image_bytes);
    if (the_image2 == NULL || out_image2 == NULL) {
        printf("\nERROR out of host memory\n");
        exit(1);
    }
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            the_image2[(size_t)r * out_width + c] = the_image[r][c];
        }
    }

    /* Device buffers for the input and output images. */
    short *d_A = NULL, *d_B = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, image_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B, image_bytes));

    /* Masks go to constant memory; these symbols are not heap allocations
       and must never be passed to cudaFree. */
    CUDA_CHECK(cudaMemcpyToSymbol(gaussmask, gauss, sizeof(gauss)));
    CUDA_CHECK(cudaMemcpyToSymbol(sharpenmask, sharpen, sizeof(sharpen)));
    CUDA_CHECK(cudaMemcpy(d_A, the_image2, image_bytes, cudaMemcpyHostToDevice));

    /* The kernels index with TILESIZE, so the block must be TILESIZE x
       TILESIZE; the grid is sized to cover the whole image. */
    const dim3 dimBlock(TILESIZE, TILESIZE, 1);
    const dim3 dimGrid((out_width + TILESIZE - 1) / TILESIZE,
                       (out_length + TILESIZE - 1) / TILESIZE, 1);

    SharpenConvolution<<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);
    CUDA_CHECK(cudaGetLastError());
    GaussConvolution<<<dimGrid, dimBlock>>>(d_A, d_B, 25, out_width);
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also waits for the kernels and reports their errors. */
    CUDA_CHECK(cudaMemcpy(out_image2, d_B, image_bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));

    /* Back to the 2D layout expected by write_image_array. */
    for (int r = 0; r < out_length; r++) {
        for (int c = 0; c < out_width; c++) {
            out_image[r][c] = out_image2[(size_t)r * out_width + c];
        }
    }
    write_image_array(name2, out_image);

    free(the_image2);
    free(out_image2);
    free_image_array(out_image, out_length);
    free_image_array(the_image, in_length);
    cudaProfilerStop();
    return 0;
}

/* ends main */

Вам также может понравиться