Академический Документы
Профессиональный Документы
Культура Документы
80568704
MIDTERM 1
The computation times are shown in the accompanying tables. Processing time was measured in microseconds
and covers only the convolution function, excluding the time spent reading and saving files. The nvprof tool
was used to measure the kernel execution time, and gettimeofday() was used for the sequential code. In both cases,
the computer used was equipped with an Intel Core i5-6300HQ CPU and an NVIDIA GTX 960M GPU.
CPU
Sharpen
Gaussian
Apollo
Lena
2,660,925
39,252
2,558,420
42,079
Tiled only
Sharpen
Gaussian
Apollo
Simple Kernel
Sharpen
Gaussian
Apollo
Apollo
Apollo
Lena
20,861
20,860
385
384
Lena
18,465
18,462
294
291
Lena
10,550
10,578
166
164
Lena
6,533
6,526
102
100
APOLLO.BMP (MICROSECONDS)
Gaussian
TILED ONLY
SIMPLE
TILED + CONST
MEMORY
6,526
6,533
10,578
10,550
18,462
18,465
20,860
20,861
Sharpen
CONSTANT MEMORY
ONLY
LENA.BMP (MICROSECONDS)
Gaussian
TILED ONLY
SIMPLE
TILED + CONST
MEMORY
100
102
164
166
291
294
384
385
Sharpen
CONSTANT MEMORY
ONLY
As we can see, the version using only constant memory is the fastest, while the kernel using shared
memory is the slowest.
Analyzing this behavior, we can presume that transferring the image from global memory to shared memory
is not worth the effort because of the overhead involved. Each output element requires 25 (the number of
elements in the mask) memory accesses to the image matrix and 25 accesses to the mask matrix. By
comparison, in the matrix multiplication kernel, whose performance is greatly improved by tiling, each
output element requires number_of_columns + number_of_rows memory accesses.
Another possible explanation for the constant-memory version being the fastest is that, since each thread
shares many input elements with its neighbors, some elements may already be in the cache,
compensating for not using shared memory. Furthermore, it spends no time loading data into shared memory.
SEQUENTIAL CODE
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include "cips.h"
/*
 * Sequential reference implementation: applies a 5x5 convolution mask to an
 * input image and writes the result to a new image file.
 *
 * Command line: in-image out-image length width [il ie]
 *   argv[1]  input image file name
 *   argv[2]  output image file name
 *   argv[3]  number of rows to process (out_length)
 *   argv[4]  number of columns to process (out_width)
 *   argv[5], argv[6]  optional il / ie values; parsed but not used below
 *
 * Prints the time spent in the convolution loops, in microseconds, on stdout.
 * Returns 0 on success; exits with status 1 on a usage or input error.
 */
int main (int argc, char *argv[])
{
   char name[80], name2[80];
   int i = 0, ie = 0, il = 0, j = 0, in_length = 0, out_length = 0,
       in_width = 0, out_width = 0;
   short **the_image, **out_image;
   struct bmpfileheader bmp_file_header;
   struct bitmapheader bmheader;

   /******************************************
    *
    * Ensure the command line is correct.
    *
    ******************************************/
   if (argc < 5 || (argc > 5 && argc < 7)) {
      printf ("\nusage: roundoff in-image out-image"
              " length width [il ie]"
              "\n"
              "\n If you do not specify il ie"
              " they will be set to 1 1."
              "\n ll le will always be" " il+length and ie+width" "\n");
      exit (1);
   }

   /* Bounded copies: the name buffers are fixed at 80 bytes, so reject
    * anything that would not fit instead of overflowing the stack. */
   if (snprintf (name, sizeof name, "%s", argv[1]) >= (int) sizeof name ||
       snprintf (name2, sizeof name2, "%s", argv[2]) >= (int) sizeof name2) {
      printf ("\nERROR file names must be shorter than %d characters",
              (int) sizeof name);
      exit (1);
   }

   out_length = atoi (argv[3]);
   out_width = atoi (argv[4]);
   if (argc > 5) {
      il = atoi (argv[5]);
      ie = atoi (argv[6]);
   }
   (void) il;                    /* parsed for CLI compatibility only */
   (void) ie;

   if (does_not_exist (name)) {
      printf ("\nERROR input file %s does not exist", name);
      exit (1);
   }

   get_image_size (name, &in_length, &in_width);

   /* The convolution below reads the_image[0 .. out_length-2][0 .. out_width-2],
    * so the requested region must fit inside the input image. */
   if (out_length > in_length || out_width > in_width) {
      printf ("\nERROR requested size %dx%d exceeds input image size %dx%d",
              out_length, out_width, in_length, in_width);
      exit (1);
   }

   the_image = (short **) allocate_image_array (in_length, in_width);
   read_image_array (name, the_image);

   /******************************************
    *
    * Create the output image and allocate
    * the output image array.
    *
    ******************************************/
   if (is_a_bmp (name)) {
      read_bmp_file_header (name, &bmp_file_header);
      read_bm_header (name, &bmheader);
      bmheader.height = out_length;
      bmheader.width = out_width;
      create_allocate_bmp_file (name2, &bmp_file_header, &bmheader);
   }
   out_image = (short **) allocate_image_array (out_length, out_width);

   /******************************************
    *
    * Convolve the input image with the 5x5
    * mask and store the result in the output
    * image array.
    *
    ******************************************/
   /* Sharpen mask (alternative, disabled):
    *   { { -1, -1, -1, -1, -1 }, { -1, 2, 2, 2, -1 }, { -1, 2, 8, 2, -1 },
    *     { -1, 2, 2, 2, -1 }, { -1, -1, -1, -1, -1 } }
    */
   /* Gaussian mask; its coefficients sum to 256. */
   int mask[5][5] = { { 1,  4,  6,  4, 1 },
                      { 4, 16, 24, 16, 4 },
                      { 6, 24, 36, 24, 6 },
                      { 4, 16, 24, 16, 4 },
                      { 1,  4,  6,  4, 1 } };
   int m, n;
   struct timeval start, end;

   gettimeofday (&start, NULL);
   for (i = 2; i < out_length - 3; i++) {
      for (j = 2; j < out_width - 3; j++) {
         int acc = 0;
         for (m = 0; m < 5; m++) {
            for (n = 0; n < 5; n++) {
               acc += mask[m][n] * the_image[i - 2 + m][j - 2 + n];
            }
         }
         acc /= 256;             /* normalise by the mask sum */
         /* Saturate to the 0..255 pixel range. */
         if (acc > 255) acc = 255;
         if (acc < 0) acc = 0;
         out_image[i][j] = (short) acc;
      }
   }
   gettimeofday (&end, NULL);

   /* Subtract before scaling so the seconds-to-microseconds multiply works
    * on a small difference and cannot overflow a 32-bit long. */
   long elapsed_us = (long) (end.tv_sec - start.tv_sec) * 1000000L
                   + (long) (end.tv_usec - start.tv_usec);
   printf ("%ld\n", elapsed_us);

   write_image_array (name2, out_image);
   free_image_array (out_image, out_length);
   free_image_array (the_image, in_length);
   return 0;
}
/* ends main */
SIMPLE KERNEL
***********************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);
/*
 * Simple (global-memory only) 5x5 convolution kernel, normalised by 256.
 *
 * One thread computes one output pixel; row comes from the y dimension and
 * col from the x dimension of a 2D grid of 2D blocks.
 *
 *   N              input image, width x width, row-major
 *   P              output image, same layout as N
 *   Mask           mask coefficients, row-major, 5 columns per row
 *   mask_elements  number of coefficients in Mask (25 for a 5x5 mask)
 *   width          image side length; used for both the row and column bounds
 *
 * Pixels that are not strictly more than 2 and less than width - 3 in both
 * coordinates are set to the constant 100.
 */
__global__ void GaussConvolution(short *N, short *P, short *Mask, int mask_elements, int width){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid may overshoot the image; such threads must not touch memory.
    if (row >= width || col >= width)
        return;

    if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
        // Accumulate in int: the mask sums to 256, so a bright neighbourhood
        // (256 * 255 = 65280) does not fit in a short.
        int acc = 0;
        for (int i = 0; i < mask_elements; i++){
            acc += Mask[i] * N[(row - 2 + i / 5) * width + (col - 2 + i % 5)];
        }
        acc /= 256;
        // Saturate to the 0..255 pixel range before narrowing to short.
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[row * width + col] = (short)acc;
    }
    else{
        P[row * width + col] = 100;
    }
}
/*
 * Simple (global-memory only) 5x5 convolution kernel, scaled by 1/2.
 *
 * One thread computes one output pixel; row comes from the y dimension and
 * col from the x dimension of a 2D grid of 2D blocks.
 *
 *   N              input image, width x width, row-major
 *   P              output image, same layout as N
 *   Mask           mask coefficients, row-major, 5 columns per row
 *   mask_elements  number of coefficients in Mask (25 for a 5x5 mask)
 *   width          image side length; used for both the row and column bounds
 *
 * Pixels that are not strictly more than 2 and less than width - 3 in both
 * coordinates are set to the constant 100.
 */
__global__ void SharpenConvolution(short *N, short *P, short *Mask, int mask_elements, int
width){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid may overshoot the image; such threads must not touch memory.
    if (row >= width || col >= width)
        return;

    if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
        int acc = 0;
        for (int i = 0; i < mask_elements; i++){
            acc += Mask[i] * N[(row - 2 + i / 5) * width + (col - 2 + i % 5)];
        }
        // NOTE(review): the divisor 2 is kept from the original code; confirm
        // it is the intended normalisation for the sharpen mask in use.
        acc /= 2;
        // Saturate to the 0..255 pixel range; a mask with negative
        // coefficients can produce a negative sum.
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[row * width + col] = (short)acc;
    }
    else{
        P[row * width + col] = 100;
    }
}
__constant__ short sharpen_ptr[25];
//__constant__ short gauss_ptr;
/*
 * Host driver for the simple-kernel version.
 *
 * Command line: in-image out-image length width [il ie]
 *   argv[1]  input image file name
 *   argv[2]  output image file name
 *   argv[3]  number of output rows (out_length)
 *   argv[4]  number of output columns (out_width)
 *   argv[5], argv[6]  optional il / ie values
 *
 * Reads the input image, converts the flat result buffer back to a 2D image
 * and writes it to the output file. Returns 0 on success; exits with status 1
 * on a usage or input error.
 */
int main(int argc, char *argv[])
{
    char response[80];
    char name[80], name2[80];
    int i = 0, ie = 0, il = 0, j = 0, in_length = 0, out_length =
        0, in_width = 0, out_width = 0;
    short **the_image, **out_image;
    struct bmpfileheader bmp_file_header;
    struct bitmapheader bmheader;
    int sat;

    /******************************************
     *
     * Ensure the command line is correct.
     *
     ******************************************/
    if (argc < 5 || (argc > 5 && argc < 7)) {
        printf("\nusage: roundoff in-image out-image"
               " length width [il ie]"
               "\n"
               "\n If you do not specify il ie"
               " they will be set to 1 1."
               "\n ll le will always be" " il+length and ie+width" "\n");
        exit(1);
    }

    /* Bounded copies: the name buffers are fixed at 80 bytes, so reject
     * anything that would not fit instead of overflowing the stack. */
    if (snprintf(name, sizeof name, "%s", argv[1]) >= (int)sizeof name ||
        snprintf(name2, sizeof name2, "%s", argv[2]) >= (int)sizeof name2) {
        printf("\nERROR file names must be shorter than %d characters",
               (int)sizeof name);
        exit(1);
    }

    out_length = atoi(argv[3]);
    out_width = atoi(argv[4]);
    if (argc > 5) {
        il = atoi(argv[5]);
        ie = atoi(argv[6]);
    }

    if (does_not_exist(name)) {
        printf("\nERROR input file %s does not exist", name);
        exit(1);
    }

    get_image_size(name, &in_length, &in_width);
    the_image = (short **)allocate_image_array(in_length, in_width);
    read_image_array(name, the_image);

    /******************************************
     *
     * Create the output image and allocate
     * the output image array.
     *
     ******************************************/
    if (is_a_bmp(name)) {
        read_bmp_file_header(name, &bmp_file_header);
        read_bm_header(name, &bmheader);
        bmheader.height = out_length;
        bmheader.width = out_width;
        create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
    }
    out_image = (short **)allocate_image_array(out_length, out_width);

    /* NOTE(review): the section that flattens the_image, allocates the device
     * buffers, launches the convolution kernel and copies the result back
     * into out_image2 is missing from this copy of the source. out_image2 is
     * read below but never declared here; restore that section before
     * building. */

    //--------------------CONVERTING TO 2D--------------------
    /* Rows are bounded by out_length and columns by out_width, so the copy
     * stays inside out_image for non-square outputs as well. */
    for (int k = 0; k < out_length; k++){
        for (int L = 0; L < out_width; L++){
            out_image[k][L] = out_image2[k * out_width + L];
        }
    }

    //-------------------WRITE INTO FILE---------------------
    write_image_array(name2, out_image);
    free_image_array(out_image, out_length);
    free_image_array(the_image, in_length);
    cudaProfilerStop();
    return 0;
}
/* ends main */
Tiled Only
***********************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>
#define TILESIZE 32
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);
/*
 * Tiled 5x5 convolution kernel, normalised by 256.
 *
 * Each block stages a TILESIZE x TILESIZE tile of the input in shared memory.
 * Threads whose 5x5 neighbourhood lies inside the tile read from shared
 * memory; the remaining threads (near the tile edge) read their neighbourhood
 * from global memory.
 *
 * Launch layout: 2D blocks of TILESIZE x TILESIZE threads; row/col are derived
 * from TILESIZE, so blockDim must match it.
 * Shared memory: TILESIZE * TILESIZE shorts, statically allocated.
 *
 *   N              input image, width x width, row-major
 *   P              output image, same layout as N
 *   gaussmask      25 mask coefficients, row-major, 5 per row
 *   mask_elements  not used by this kernel
 *   width          image side length; used for both the row and column bounds
 *
 * Pixels that are not strictly more than 2 and less than width - 3 in both
 * coordinates are set to the constant 100.
 */
__global__ void GaussConvolution(short *N, short *P, short *gaussmask, int mask_elements, int
width){
    __shared__ short N_shared[TILESIZE][TILESIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = TILESIZE * blockIdx.y + ty;
    const int col = TILESIZE * blockIdx.x + tx;
    const bool in_image = (row < width) && (col < width);

    // Threads past the image edge load a zero instead of reading out of
    // bounds; every thread still reaches the barrier below.
    N_shared[ty][tx] = in_image ? N[row * width + col] : (short)0;
    __syncthreads();

    if (!in_image)
        return;

    if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
        int acc = 0;

        //NO HALO ELEMENTS
        if (tx > 2 && ty > 2 && tx < (TILESIZE - 3) && ty < (TILESIZE - 3)){
            for (int i = 0; i < 5; i++){
                // Row index inside the tile, computed once per mask row.
                const int Nrow = ty - 2 + i;
                for (int k = 0; k < 5; k++){
                    acc += gaussmask[i * 5 + k] * N_shared[Nrow][tx - 2 + k];
                }
            }
        }
        //HALO ELEMENTS
        else {
            for (int i = 0; i < 5; i++){
                // Row index in the full image, computed once per mask row.
                const int Nrow = row - 2 + i;
                for (int k = 0; k < 5; k++){
                    acc += gaussmask[i * 5 + k] * N[Nrow * width + col - 2 + k];
                }
            }
        }

        acc /= 256;
        //SATURATION
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[row * width + col] = (short)acc;
    }
    else{
        P[row * width + col] = 100;
    }
}
__global__ void SharpenConvolution(short *N, short *P, short *sharpenmask, int mask_elements,
int width){
__shared__ short N_shared[TILESIZE][TILESIZE];
int row = TILESIZE * blockIdx.y + threadIdx.y;
int col = TILESIZE * blockIdx.x + threadIdx.x;
N_shared[threadIdx.y][threadIdx.x] = N[row * width + col];
__syncthreads();
//printf("thead Synch");
long acc = 0;
int i;
int j;
if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
//NO HALO ELEMENTS
if (threadIdx.x > 2 && threadIdx.y > 2 && threadIdx.x < (TILESIZE - 3) &&
threadIdx.y < (TILESIZE - 3)){
for (i = 0; i < 5; i++){
int Nrow = threadIdx.y - 2 + i;
//This index will probably be
stored in a register, and doing a single loop computing 5 elements by iteration recudes the
number of times this index is being computed
acc = acc + sharpenmask[i * 5 + 0] * N_shared[Nrow][threadIdx.x 2];
acc = acc + sharpenmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x 1];
acc = acc + sharpenmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x];
acc = acc + sharpenmask[i * 5 + 3] * N_shared[Nrow][threadIdx.x +
1];
acc = acc + sharpenmask[i * 5 + 4] * N_shared[Nrow][threadIdx.x +
2];
}
}
//HALO ELEMENTS
else {
for (i = 0; i < 5; i++){
int Nrow = row - 2 + i;
acc = acc + sharpenmask[i * 5 + 0] * N[(Nrow)* width + col - 2];
acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow)* width + col - 1];
acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow)* width + col];
*
* Create the output image and allocate
* the output image array.
*
******************************************/
if (is_a_bmp(name)) {
read_bmp_file_header(name, &bmp_file_header);
read_bm_header(name, &bmheader);
bmheader.height = out_length;
bmheader.width = out_width;
create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
}
out_image = (short **)allocate_image_array(out_length, out_width);
/* ends main */
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda_profiler_api.h>
#define TILESIZE 32
#define MASK_WIDTH 25
__constant__ short gaussmask[MASK_WIDTH];
__constant__ short sharpenmask[MASK_WIDTH];
extern "C"{
#include "cips.h"
}
extern "C" int does_not_exist(char file_name[]);
extern "C" int read_image_array(char *file_name, short **array);
extern "C" int get_image_size(char *file_name, int *rows, int *cols);
extern "C" int is_a_bmp(char *file_name);
extern "C" void read_bmp_file_header(char *file_name, struct bmpfileheader *file_header);
extern "C" void read_bm_header(char *file_name, struct bitmapheader *bmheader);
extern "C" void create_allocate_bmp_file(char *file_name, struct bmpfileheader *file_header,
struct bitmapheader *bmheader);
extern "C" int write_image_array(char *file_name, short **array);
extern "C" int free_image_array(short **the_array, int length);
/*
 * 5x5 convolution kernel reading its coefficients from the gaussmask
 * constant-memory array and its pixels from global memory; normalised by 256.
 *
 * Launch layout: 2D blocks of TILESIZE x TILESIZE threads; row/col are derived
 * from TILESIZE, so blockDim must match it.
 *
 *   N              input image, width x width, row-major
 *   P              output image, same layout as N
 *   mask_elements  not used by this kernel
 *   width          image side length; used for both the row and column bounds
 *
 * Pixels that are not strictly more than 2 and less than width - 3 in both
 * coordinates are copied unchanged from N.
 */
__global__ void GaussConvolution(short *N, short *P, int mask_elements, int width){
    const int row = TILESIZE * blockIdx.y + threadIdx.y;
    const int col = TILESIZE * blockIdx.x + threadIdx.x;

    // The grid may overshoot the image; such threads must not touch memory.
    if (row >= width || col >= width)
        return;

    if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
        int acc = 0;
        for (int i = 0; i < 5; i++){
            // Row index in the full image, computed once per mask row.
            const int Nrow = row - 2 + i;
            for (int k = 0; k < 5; k++){
                acc += gaussmask[i * 5 + k] * N[Nrow * width + col - 2 + k];
            }
        }

        acc /= 256;
        //SATURATION
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[row * width + col] = (short)acc;
    }
    else{
        P[row * width + col] = N[row * width + col];
    }
}
/*
 * 5x5 convolution kernel reading its coefficients from the sharpenmask
 * constant-memory array and its pixels from global memory.
 *
 * NOTE(review): the body of this kernel was unreadable in this copy of the
 * source and has been rebuilt to mirror the constant-memory GaussConvolution
 * kernel. The divisor (2, as in the simple SharpenConvolution kernel) and the
 * border handling (copy from N) are assumptions; confirm against the
 * original file.
 *
 * Launch layout: 2D blocks of TILESIZE x TILESIZE threads; row/col are derived
 * from TILESIZE, so blockDim must match it.
 *
 *   N              input image, width x width, row-major
 *   P              output image, same layout as N
 *   mask_elements  not used by this kernel
 *   width          image side length; used for both the row and column bounds
 */
__global__ void SharpenConvolution(short *N, short *P, int mask_elements, int width){
    const int row = TILESIZE * blockIdx.y + threadIdx.y;
    const int col = TILESIZE * blockIdx.x + threadIdx.x;

    // The grid may overshoot the image; such threads must not touch memory.
    if (row >= width || col >= width)
        return;

    if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
        int acc = 0;
        for (int i = 0; i < 5; i++){
            // Row index in the full image, computed once per mask row.
            const int Nrow = row - 2 + i;
            for (int k = 0; k < 5; k++){
                acc += sharpenmask[i * 5 + k] * N[Nrow * width + col - 2 + k];
            }
        }

        acc /= 2;   // assumed divisor, see NOTE(review) above
        //SATURATION
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[row * width + col] = (short)acc;
    }
    else{
        P[row * width + col] = N[row * width + col];   // assumed, see NOTE(review) above
    }
}
int main(int argc, char *argv[])
{
char response[80];
char name[80], name2[80];
int i = 0, ie = 0, il = 0, j = 0, in_length = 0, out_length =
0, in_width = 0, out_width = 0;
short **the_image, **out_image;
struct bmpfileheader bmp_file_header;
struct bitmapheader bmheader;
int sat;
/******************************************
*
* Ensure the command line is correct.
*
******************************************/
if (argc < 5 || (argc > 5 && argc < 7)) {
printf("\nusage: roundoff in-image out-image"
" length width [il ie]"
"\n"
"\n If you do not specify il ie"
" they will be set to 1 1."
"\n ll le will always be" " il+length and ie+width" "\n");
exit(0);
}
strcpy(name, argv[1]);
strcpy(name2, argv[2]);
out_length = atoi(argv[3]);
out_width = atoi(argv[4]);
if (argc > 5) {
il = atoi(argv[5]);
ie = atoi(argv[6]);
}
+
+
+
+
+
2];
1];
1];
2];
if (does_not_exist(name)) {
printf("\nERROR input file %s does not exist", name);
exit(0);
}
get_image_size(name, &in_length, &in_width);
the_image = (short **)allocate_image_array(in_length, in_width);
read_image_array(name, the_image);
/******************************************
*
* Create the output image and allocate
* the output image array.
*
******************************************/
if (is_a_bmp(name)) {
read_bmp_file_header(name, &bmp_file_header);
read_bm_header(name, &bmheader);
bmheader.height = out_length;
bmheader.width = out_width;
create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
}
out_image = (short **)allocate_image_array(out_length, out_width);
/* ends main */
/*
 * Tiled 5x5 convolution kernel with coefficients read from the gaussmask
 * constant-memory array; normalised by 256.
 *
 * Each block stages a TILESIZE x TILESIZE tile of the input in shared memory.
 * Threads whose 5x5 neighbourhood lies inside the tile read from shared
 * memory; the remaining threads (near the tile edge) read their neighbourhood
 * from global memory.
 *
 * Launch layout: 2D blocks of TILESIZE x TILESIZE threads; row/col are derived
 * from TILESIZE, so blockDim must match it.
 * Shared memory: TILESIZE * TILESIZE shorts, statically allocated.
 *
 *   N              input image, width x width, row-major
 *   P              output image, same layout as N
 *   mask_elements  not used by this kernel
 *   width          image side length; used for both the row and column bounds
 *
 * Pixels that are not strictly more than 2 and less than width - 3 in both
 * coordinates are set to the constant 100.
 */
__global__ void GaussConvolution(short *N, short *P, int mask_elements, int width){
    __shared__ short N_shared[TILESIZE][TILESIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = TILESIZE * blockIdx.y + ty;
    const int col = TILESIZE * blockIdx.x + tx;
    const bool in_image = (row < width) && (col < width);

    // Threads past the image edge load a zero instead of reading out of
    // bounds; every thread still reaches the barrier below.
    N_shared[ty][tx] = in_image ? N[row * width + col] : (short)0;
    __syncthreads();

    if (!in_image)
        return;

    if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
        int acc = 0;

        //NO HALO ELEMENTS
        if (tx > 2 && ty > 2 && tx < (TILESIZE - 3) && ty < (TILESIZE - 3)){
            for (int i = 0; i < 5; i++){
                // Row index inside the tile, computed once per mask row.
                const int Nrow = ty - 2 + i;
                for (int k = 0; k < 5; k++){
                    acc += gaussmask[i * 5 + k] * N_shared[Nrow][tx - 2 + k];
                }
            }
        }
        //HALO ELEMENTS
        else {
            for (int i = 0; i < 5; i++){
                // Row index in the full image, computed once per mask row.
                const int Nrow = row - 2 + i;
                for (int k = 0; k < 5; k++){
                    acc += gaussmask[i * 5 + k] * N[Nrow * width + col - 2 + k];
                }
            }
        }

        acc /= 256;
        //SATURATION
        if (acc > 255) acc = 255;
        if (acc < 0) acc = 0;
        P[row * width + col] = (short)acc;
    }
    else{
        P[row * width + col] = 100;
    }
}
__global__ void SharpenConvolution(short *N, short *P, int mask_elements, int width){
__shared__ short N_shared[TILESIZE][TILESIZE];
int row = TILESIZE * blockIdx.y + threadIdx.y;
int col = TILESIZE * blockIdx.x + threadIdx.x;
N_shared[threadIdx.y][threadIdx.x] = N[row * width + col];
__syncthreads();
long acc = 0;
int i;
int j;
if ((row > 2) && (col > 2) && (row < (width - 3)) && (col < (width - 3))){
//NO HALO ELEMENTS
if (threadIdx.x > 2 && threadIdx.y > 2 && threadIdx.x < (TILESIZE - 3) &&
threadIdx.y < (TILESIZE - 3)){
for (i = 0; i < 5; i++){
int Nrow = threadIdx.y - 2 + i;
//This index will probably be
stored in a register, and doing a single loop computing 5 elements by iteration recudes the
number of times this index is being computed
acc = acc + sharpenmask[i * 5 + 0] * N_shared[Nrow][threadIdx.x 2];
acc = acc + sharpenmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x 1];
acc = acc + sharpenmask[i * 5 + 1] * N_shared[Nrow][threadIdx.x];
acc = acc + sharpenmask[i * 5 + 3] * N_shared[Nrow][threadIdx.x +
1];
acc = acc + sharpenmask[i * 5 + 4] * N_shared[Nrow][threadIdx.x +
2];
}
}
//HALO ELEMENTS
else {
for (i = 0; i < 5; i++){
int Nrow = row - 2 + i;
acc = acc + sharpenmask[i * 5 + 0] * N[(Nrow) * width + col - 2];
acc = acc + sharpenmask[i * 5 + 1] * N[(Nrow) * width + col - 1];
/******************************************
*
* Create the output image and allocate
* the output image array.
*
******************************************/
if (is_a_bmp(name)) {
read_bmp_file_header(name, &bmp_file_header);
read_bm_header(name, &bmheader);
bmheader.height = out_length;
bmheader.width = out_width;
create_allocate_bmp_file(name2, &bmp_file_header, &bmheader);
}
out_image = (short **)allocate_image_array(out_length, out_width);
/* ends main */