You are on page 1 of 30

-

:

:

___ 2011 .

______________________( ..)
()

230100


MPI, OpenCL OpenMP
________________________ ( . . )
()

( ..)

__________________ ( . .)
()

( ..)

2011 .

1.

2.

2.1 -

2.2 -....

2.3 ..

2.4

3. -..............

4. -

11

5. ...........

12

6. ............

13

1. ..
2. .............................................................

1.

, . ,
20 300
. .
,
.
,
,
.
,
.
, ,
, .
.

, .
,
.

2.
, .
2.1 -

, - [5].

$c_n$, $\varepsilon_n$, $[0, 2\pi]$:
$$\zeta(x,y,t) = \sum_n c_n \cos\left(u_n x + v_n y - \omega_n t + \varepsilon_n\right). \qquad (1)$$

- , (x,y,t)
,
$$2\,S(u,v)\,du\,dv = \sum_n c_n^2 .$$

-
, .
- , (1)
. , -
,
, (1).
(
) -
.
2.2 -
-
OpenCL, OpenMP MPI,
.
- ,
[5]
$$S(\omega,\theta) = A\,\omega^{-k} \exp\left[-B\,\omega^{-n}\right] Q(\omega,\theta),$$
$Q(\omega, \theta)$ . $A$, $B$, $k$, $n$
,
1964 [5]
$$A = 0.28\,(2\pi)^4 h^2 \tau^{-4}, \quad B = 0.44\,(2\pi)^4 \tau^{-4}, \quad k = 5, \quad n = 4,$$
h, t . ,
, cn

$$c_n = \sqrt{2 \int_{\omega_n}^{\omega_{n+1}} S(\omega)\,d\omega}\,.$$

(1).

2.3
[6].
-
,
( ). ,


[1].
$$\zeta(x,y,t) = \sum_{i=0}^{p_1} \sum_{j=0}^{p_2} \sum_{k=0}^{p_3} \Phi_{i,j,k}\,\zeta(x-i,\,y-j,\,t-k) + \varepsilon(x,y,t), \qquad (2)$$

,
-. ,
,
A = b, ai,j = |x(i)-x(j)|,|y(i)-y(j)|,|t(i)-t(j)|, bi = x(i),y(i),t(i)
x(i) = mod((i+1)/(p1p2), p1) , y(i) = mod((i+1)/p3, p2) , t(i) = mod(i+1, p3) .
,
(0.1-0.7) (0.1-0.52) [6].
, .
z=f(y) F(z)

F(z) = (y),

(3)

(y) .
.

- [5, 9],

$$K_z(x,y,t) = \sum_{m=0}^{\infty} C_m^2\,\frac{K_y^m(x,y,t)}{m!}, \qquad C_m = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^{\infty} f(y)\,H_m(y)\,\exp\left(-\frac{y^2}{2}\right) dy, \qquad (4)$$

f(y) (3) Hm(y) . Cm


.
2.4

OpenCL, OpenMP MPI,
. , ,

. -,
, [1, 6],
.
p .
, ,
i i .
.
(x, y, t) = e- (|x|+|y|+|t|)cos(x)cos(y)cos(t)
, ,
.
.
(z) = F(z) - 2T(z, ) ,
T(z, ) , F(z) .
=1.2 . 1.

. 1. (x) =1.2, . 0.6130,


0.6243, 0.2004, 0.1026.

(3)
, ,
,
().
$$z = f(y) \approx \sum_{i=0}^{N} d_i\,y^i .$$

12 500
[-5z, 5z].
, ,
.
;
$10^{-5}$ $0.43 \cdot 10^{-3}$.

(4),
[5, 9]
$$\sigma_z^2 = \sum_{m=0}^{\infty} \frac{C_m^2}{m!} .$$

-. ,
, . -
,
.

( ).
(Mersenne Twister) $2^{19937}-1$.
, dcmt.

[7, 9].

. 2.


.
, .
,
. ,
.
( ) [2].

z x,y,t = i,j,k z xi,y j,z k + i,j,k z x+i,y+j,z+k + x,y,t .
i

3.
-
- :
OpenCL, OpenMP MPI,
, .

.
,
.

.
, .

(FPU), . ,
2 1
,
100 ( ) [8, 11].

OpenCL .

(2).
( ) ,
. ( ,
)
. 3. 2 1-2 OpenCL
.

. 3. () ()
() (b) .

OpenCL
.
() ,
[8]. ,

( 2 ).
,
.
. OpenCL
, OpenMP . MPI
;
, ,
. .

. 4. , ( ).

,
- ,
. ,
.

:
1) gcc (SUSE Linux) 4.3.2 [gcc-4_3-branch revision 141291] OpenMP,
MPI
2) llvm version 2.6svn (Built Jan 18 2011(17:37:10)) OpenCL.
3) NVIDIA version 260.19.36
4) MPICH version: 1.2.7 (release date 2005/06/22 16:33:49)
:
1) gnuplot 4.2 patchlevel 6
2) NVIDIA Compute Visual Profiler Version 3.2.0

4.
-
-
. : ,
, .
.
:
1) NVIDIA 8800GT, 14 :

8 CUDA (1,5 )
2
1 (FPU)
1

2) 10 :

2 x Intel Xeon E5335


InfiniBand, 2 x Gigabit Ethernet
8 Gb FB-DIMM DDR2
x86_64-suse-linux ( 2.6.32.12-0.7-default)

1-2
. 6-8. 1
. , -
, ,
.
, (1),
. OpenCL -
,

.
1. OpenCL, OpenMP MPI -
( ).

-
OpenCL OpenMP MPI OpenCL OpenMP MPI
400000

0.82

40,44 32,60

1,80

0,080 0,075

440000

0.90

44,59 35,78

1,92

0,100 0,093

480000

0.99

48,49 38,93

2,29

0,097 0,126

520000

1.07

52,65 41,92

2,43

0,118

0,117

560000

1.15

56,45 45,00

2,51

0,117

0,161

600000

1.23

60,85 48,80

2,54

0,123 0,132

640000

1.31

65,07 53,02

2,73

0,123 0,160

680000

1.40

68,90 54,92

2,80

0,138 0,136

720000

1.48

72,49 58,42

2,88

0,144

760000

1.56

76,86 61,41

3,47

0,156 0,155

800000

1.64

81,03 66,42

3,25

0,166

0,173
0,174

OpenCL -
50 (. 5).
,
OpenMP MPI
.

. 5. (AR) - (LH)
.


. -
,
.
OpenCL OpenMP, .
. 6.

. 6. (
) (AR) - (LH)
.

()
OpenMP MPI. ,
, (. 7).

.
OpenCL ,

(occupancy), ,
[12]. 2
.
, - ,
.
2. OpenCL

(occupancy)

(generate_water_surface,
connect_wave_surface_parts3)

0,167

(generate_water_surface_lh)

0,5

(oclMersenneTwister, NVIDIA
SDK)

0,667

(oclTranspose,
NVIDIA SDK)

1,0

. 7. ( )
(AR) - (LH)
.


.

(
OpenMP ),
.
,
, , 2%
(. 8).

. 8. , ,
.

5.
,
,
. -

. ,
MPI,
.

.
- ,
-. OpenCL
.
.
. -
( ),

.
,

.

6.
1. ., . . .: , 1974
2. .., ..
. //. 2003, .-, 2003, .2, .64-68
3. . ., . ., . . . .:
, 1977
4. ., . . .: , 1999
5. . ., . ., . .
. .: , 1978
6. . ., . . . .:
, 1990
7. Makoto Matsumoto, Takuji Nishimura: Dynamic Creation of Pseudorandom Generators
8. OpenCL Programming Guide for the CUDA Architecture (2010)
9. Podlozhnyuk V. Parallel Mersenne Twister (2007)
10. Ruud van der Pas: OpenMP and performance. In: International Workshop on OpenMP, Dresden
(2009)
11. Intel 64 and IA-32 Architectures Optimization Reference Manual (2011)
12. Compute Visual Profiler User Guide (2010)

1.

,
: ,
, , , , .
. 7-8.

. 7. .

. 8. -.

2.
1.
1. OpenCL ( ).
// Work-group kernel. Work-group `part` updates z in place for time indices
// k in [part*part_size, part*part_size + part_size - interval): every visited
// cell z(i, j, k) gets the sum of f(x, y, t) * z(i-x, j-y, k-t) added to it,
// with (x, y, t) bounded by fsize and by the cell's own position.
// NOTE(review): presumably the first (independent parts) pass of the
// autoregressive surface generator described in the paper text -- confirm.
kernel
void generate_wave_surface(
global float* f_src,
// Global buffer of fsize.x*fsize.y*fsize.z floats; staged into local `f` below.
size3 fsize,
// Extents of the f window along x, y, t (used for indexing f and as sum bounds).
global float* z,
// Buffer that is both read and accumulated into (`+=`) by this kernel.
size3 zsize,
// Extents used to index z; zsize.x and zsize.y also bound the i and j sweeps.
int part_size,
// Number of time indices per work-group: the group starts at part*part_size.
int interval,
// Number of trailing time indices of each part that this kernel leaves untouched.
local float* f,
// Local-memory copy of f_src; must hold fsize.x*fsize.y*fsize.z floats.
local float* sum,
// Local scratch indexed by local id; slots u..u+ip-1 are read back by work-item u.
size3 offset)
// `offset` is not referenced anywhere in this kernel body.
{
const int part = get_group_id(0);
const int p = get_local_size(0);

const int u = get_local_id(0);


// Work-items cooperate in groups of ip: iu is the position inside the group.
const int ip = 2;
const int iu = u%ip;
// Half-open time range [t0, t1) handled by this work-group.
const int t0 = part*part_size;
const int t1 = t0 + part_size - interval;
// Stage f_src into local memory and wait until the copy has completed.
event_t evt = async_work_group_copy(f, f_src, fsize.x*fsize.y*fsize.z, 0);
wait_group_events(1, &evt);
// Largest index along each axis, with the time axis taken relative to t0.
int maxs[3] = {zsize.x-1, zsize.y-1, t1-t0-1};
int maxd = maxs[0]+maxs[1]+maxs[2]+1;
// Sweep d = i + j + (k - t0) in increasing order; all cells visited for one d
// share the same index sum. maxd is one larger than the largest reachable sum.
for (int d=0; d<=maxd; ++d) {
int mi=min(d, maxs[0]);
int ni=max(0, mi-maxs[1]-maxs[2]);
// Each group of ip work-items starts at its own i (u/ip) and strides by p/ip.
// NOTE(review): the trip count of this loop (and of the j loop) depends on u,
// while the loop body contains barrier(); OpenCL requires every work-item of
// the group to reach a barrier the same number of times -- confirm this is safe
// on the targeted devices.
for (int i=mi-u/ip; i>=ni; i-=p/ip) {
int mj=min(d-i, maxs[1]);
int nj=max(0, d-i-maxs[2]);
for (int j=mj; j>=nj; --j) {
int k=d-i-j;
// Convert the part-relative time index into an absolute index into z.
k += t0;
// Sum bounds: clip the f window so that i-x, j-y, k-t never go below zero.
int m1 = min(i+1, fsize.x);
int m2 = min(j+1, fsize.y);
int m3 = min(k+1, fsize.z);
sum[u] = 0;
// Partial sum: this work-item takes every ip-th x, starting at x = iu.
for (int x=iu; x<m1; x+=ip)
for (int y=0; y<m2; y++)
for (int t=0; t<m3; t++)

sum[u] += f[index(x, y, t, fsize)]*z[index(i-x, j-y, k-t, zsize)];


// Make the partial sums in local memory visible before they are combined.
barrier(CLK_LOCAL_MEM_FENCE);
// The first work-item of each group of ip adds up the partial sums and
// accumulates the result into z(i, j, k).
// NOTE(review): there is no barrier after this read, so a neighbouring
// work-item may already reset sum[u+c] for its next iteration -- verify.
if (iu == 0) {
float s = 0;
for (uint c=0; c<ip; ++c)
s += sum[u+c];
z[index(i, j, k, zsize)] += s;
}
}
}
// Synchronise the work-group between consecutive values of d.
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
2. OpenCL ( ).
// Work-group kernel that processes the last `interval` time indices of each
// part, i.e. k in [(part+1)*partSize - interval, (part+1)*partSize).
// For every visited cell z(i, j, k) two sums are accumulated and added to the
// cell in place:
//   1. a forward sum  f(x, y, t+skip) * z(i+x, j+y, k+t+skip), and
//   2. a backward sum f(x, y, t)      * z(i-x, j-y, k-t).
// NOTE(review): presumably the pass that joins independently generated parts
// of the autoregressive surface -- confirm against the host code.
//
// Parameters:
//   f_src     global buffer of fsize.x*fsize.y*fsize.z floats, staged into `f`
//   fsize     extents of the f window along x, y, t
//   z         buffer that is read and accumulated into
//   zsize     extents used to index z and to bound the sweeps
//   partSize  number of time indices per part
//   interval  number of trailing time indices of a part handled here
//   f         local-memory copy of f_src
//   sum       local scratch, one slot per work-item
//   offset    not referenced by this kernel
kernel
void connect_wave_surface_parts(
    global float* f_src,
    size3 fsize,
    global float* z,
    size3 zsize,
    int partSize,
    int interval,
    local float* f,
    local float* sum,
    size3 offset)
{
    const int part = get_group_id(0), u = get_local_id(0), p = get_local_size(0);
    // Work-items cooperate in groups of ip; iu is the position inside the group.
    const int ip = 2, iu = u%ip;
    // Half-open time range [t0, t1) of the seam owned by this work-group.
    const int t0 = (part+1)*partSize - interval;
    const int t1 = t0 + interval;

    // Stage f_src into local memory and wait until the copy has completed.
    event_t evt = async_work_group_copy(f, f_src, fsize.x*fsize.y*fsize.z, 0);
    wait_group_events(1, &evt);

    // Largest index along each axis, with the time axis relative to t0.
    int maxs[3] = {zsize.x-1, zsize.y-1, interval-1};
    int maxd = maxs[0]+maxs[1]+maxs[2]+1;

    // Sweep d = i + j + (k - t0) in increasing order.
    for (int d=0; d<=maxd; ++d) {
        int mi=min(d, maxs[0]);
        int ni=max(0, mi-maxs[1]-maxs[2]);
        // NOTE(review): loop trip counts depend on u while the body contains
        // barrier(); every work-item of a group must reach a barrier equally
        // often -- confirm this holds on the targeted devices.
        for (int i=mi-u/ip; i>=ni; i-=p/ip) {
            int mj=min(d-i, maxs[1]);
            int nj=max(0, d-i-maxs[2]);
            for (int j=mj; j>=nj; --j) {
                int k=d-i-j;
                // Convert the seam-relative time index into an absolute one.
                k += t0;
                // Number of leading t entries of f skipped by the forward sum.
                int skip = t1-k+1;

                // Forward sum, clipped so i+x, j+y, k+t+skip stay inside zsize.
                int m1 = min(zsize.x-i, fsize.x);
                int m2 = min(zsize.y-j, fsize.y);
                int m3 = min(zsize.z-k, fsize.z)-skip;
                sum[u] = 0;
                for (int x=iu; x<m1; x+=ip) {
                    for (int y=0; y<m2; y++) {
                        for (int t=0; t<m3; t++) {
                            sum[u] += f[index(x, y, t+skip, fsize)]
                                     *z[index(i+x, j+y, k+t+skip, zsize)];
                        }
                    }
                }

                // Backward sum, clipped so i-x, j-y, k-t never go below zero.
                m1 = min(i+1, fsize.x);
                m2 = min(j+1, fsize.y);
                m3 = min(k+1, fsize.z);
                for (int x=iu; x<m1; x+=ip) {
                    for (int y=0; y<m2; y++) {
                        for (int t=0; t<m3; t++) {
                            sum[u] += f[index(x, y, t, fsize)]
                                     *z[index(i-x, j-y, k-t, zsize)];
                        }
                    }
                }

                // Publish the partial sums, then let the first work-item of
                // each group of ip combine them and update z(i, j, k).
                barrier(CLK_LOCAL_MEM_FENCE);
                if (iu == 0) {
                    float s = 0;
                    for (uint c=0; c<ip; ++c)
                        s += sum[u+c];
                    z[index(i, j, k, zsize)] += s;
                }
            }
        }
        // Synchronise the work-group between consecutive values of d.
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
3. OpenMP ( ).
void generate_water_surface(
const Discrete<Real>& f,
// .
const size3& fsize,
// - .
uint part_size,
//
uint interval,
//
const size3& zsize,
//
Discrete<Real>& Z)
//
{
const Index<3> id(fsize);
//
const Index<3> idz(zsize);
#pragma omp parallel shared(part_size, interval) num_threads(zsize[2]/part_size)
{
uint part = omp_get_thread_num();

uint t0 = part*part_size;
uint t1 = min(t0 + part_size - interval, zsize[2]);
for (uint x=0; x<zsize[0]; x++) {
for (uint y=0; y<zsize[1]; y++) {
for (uint t=t0; t<t1; t++) {
int m1 = min(x+1u, fsize[0]);
int m2 = min(y+1u, fsize[1]);
int m3 = min(t+1u, fsize[2]);
Real sum = 0;
for (int i=m1-1; i>=0; --i)
for (int j=m2-1; j>=0; --j)
for (int k=m3-1; k>=0; --k)
sum += f[f.size()-1-id(i, j, k)]*Z[idz(x-i, y-j, t-k)];
Z[idz(x, y, t)] += sum;
}
}
}
}
}
4. OpenMP ( ).
void connect_water_surface_parts(
const Discrete<Real>& f,
// .
const size3& fsize,
// - .
Discrete<Real>& z,
//
const size3& zsize,
//
uint part_size,
//
uint interval)
//
{
const Index<3> id(fsize);

const Index<3> idz(zsize);


#pragma omp parallel shared(part_size, interval) num_threads(zsize[2]/part_size)
{
uint part = omp_get_thread_num();
uint t0 = (part+1u)*part_size - interval;
uint t1 = min(t0 + interval, zsize[2]);
for (uint x=0; x<zsize[0]; x++) {
for (uint y=0; y<zsize[1]; y++) {
for (uint t=t0; t<t1; t++) {
Real sum = 0;
// calc sum right
int skip = t1 - t + 1;
int m1 = min(fsize[0], zsize[0]-x);
int m2 = min(fsize[1], zsize[1]-y);
int m3 = min(fsize[2], zsize[2]-t);
if (m3 > skip) {
m3 -= skip;
for (int i=0; i<m1; i++)
for (int j=0; j<m2; j++)
for (int k=0; k<m3; k++)
sum += f[f.size()-1-id(i, j, k + skip)]*z[idz(x+i, y+j, t+k +

skip)];
}

// calc left sum


m1 = min(x+1u, fsize[0]);
m2 = min(y+1u, fsize[1]);
m3 = min(t+1u, fsize[2]);
for (int i=m1-1; i>=0; --i)

for (int j=m2-1; j>=0; --j)


for (int k=m3-1; k>=0; --k)
sum += f[f.size()-1-id(i, j, k)]*z[idz(x-i, y-j, t-k)];
z[idz(x, y, t)] += sum;
}
}
}
}
}
5. MPI ( ).
void generate_water_surface_mpi(
const Discrete<Real>& f,
// .
const size3& fsize,
// - .
uint partSize,
//
uint interval,
//
const size3& zsize,
//
Discrete<Real>& z)
//
{
const Index_r<3> id(zsize);
const Index<3> idf(fsize);
for (int x=0; x<zsize[0]; x++) {
for (int y=0; y<zsize[1]; y++) {
for (int t=0; t<partSize-interval; t++) {
Real sum = 0.0f;
int m1 = min(x+1u, fsize[0]);
int m2 = min(y+1u, fsize[1]);
int m3 = min(t+1u, fsize[2]);
for (int i=m1-1; i>=0; --i)
for (int j=m2-1; j>=0; --j)

for (int k=m3-1; k>=0; --k)


sum += f[idf(i, j, k)]*z[id(x-i, y-j, t-k)];
z[id(x, y, t)] += sum;
}
}
}
}
6. MPI ( ).
void connect_water_surface_parts_mpi(
const Discrete<Real>& f,
// .
const size3& fsize,
// - .
Discrete<Real>& z,
//
const size3& zsize,
//
uint part_size,
//
uint interval)
//
{
uint t0 = part_size - interval;
uint t1 = part_size;
const Index_r<3> id(zsize);
const Index<3> idf(fsize);
for (uint x=0; x<zsize[0]; x++) {
for (uint y=0; y<zsize[1]; y++) {
for (uint t=t0; t<t1; t++) {
Real sum = 0.0f;

int skip = t1-t;


int m1 = min(fsize[0], zsize[0]-x);
int m2 = min(fsize[1], zsize[1]-y);

int m3 = fsize[2]-skip;
for (int i=0; i<m1; i++)
for (int j=0; j<m2; j++)
for (int k=0; k<m3; k++)
sum += f[idf(i, j, k + skip)]*z[id(x+i, y+j, t+k + skip)];
m1 = min(x+1, fsize[0]);
m2 = min(y+1, fsize[1]);
m3 = min(t+1, fsize[2]);
for (int i=m1-1; i>=0; --i)
for (int j=m2-1; j>=0; --j)
for (int k=m3-1; k>=0; --k)
sum += f[idf(i, j, k)]*z[id(x-i, y-j, t-k)];
z[id(x, y, t)] += sum;
}
}
}
}

2. -
7. OpenCL ( -).
// One work-group per output cell: group ids (0, 1, 2) select the cell
// (x, y, t) = (id0*zdelta.x, id1*zdelta.y, id2*zdelta.z). The work-items of
// the group split the COEFS_SIZE_X*COEFS_SIZE_Y harmonics between them, each
// accumulating coefs[c]*cos(phase) into its own slot of reduc_buf; work-item 0
// then adds the slots up and stores the total into `surface`.
//
// Parameters:
//   coefs        per-harmonic amplitudes, COEFS_SIZE_X*COEFS_SIZE_Y entries
//   coefs_size   not referenced; the bounds come from the COEFS_SIZE_* macros
//   sdom         domain providing the minimum and (via delta2) the step of
//                the two harmonic parameters w and teta
//   zsize        extents used to index `surface`
//   zdelta       grid steps along x, y, t
//   white_noise  per-harmonic value scaled by 2*PI and added to the phase
//   surface      output buffer
//   reduc_buf    local scratch, one slot per work-item
// NOTE(review): 9.8154 is presumably the gravitational acceleration -- confirm.
kernel
void generate_water_surface_lh(
    global Real* coefs,
    Size2 coefs_size,
    Domain2 sdom,
    size3 zsize,
    Real3 zdelta,
    global float* white_noise,
    global float* surface,
    local float* reduc_buf)
{
    // Output cell owned by this work-group.
    const uint cell_x = get_group_id(0);
    const uint cell_y = get_group_id(1);
    const uint cell_t = get_group_id(2);
    // Position of this work-item inside the group and the group size.
    const uint lane = get_local_id(0);
    const uint lanes = get_local_size(0);

    const Real x = cell_x*zdelta.x;
    const Real y = cell_y*zdelta.y;
    const Real t = cell_t*zdelta.z;
    const Real2 step = delta2(sdom);

    reduc_buf[lane] = 0.0;
    // Each work-item handles harmonics lane, lane+lanes, lane+2*lanes, ...
    uint c = lane;
    while (c<COEFS_SIZE_X*COEFS_SIZE_Y) {
        // Flat index c -> (row, column) of the coefficient grid.
        Real w = mad(step.x, ((c/COEFS_SIZE_Y) % (COEFS_SIZE_X - 0)), sdom.min.x);
        Real teta = mad(step.y, (c % (COEFS_SIZE_Y - 0)), sdom.min.y);
        reduc_buf[lane] += coefs[c]*cos(w*(w*native_divide(cos(teta)*x + sin(teta)*y,
            9.8154) - t) + 2.0*PI*white_noise[c]);
        c += lanes;
    }

    // All partial sums must be in local memory before they are combined.
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lane == 0) {
        Real total = 0.0;
        for (uint i=0; i<lanes; ++i)
            total += reduc_buf[i];
        surface[index(cell_x, cell_y, cell_t, zsize)] = total;
    }
}
8. OpenMP ( -).
void generate_water_surface(
const Discrete<Real>& coefs,
//

const size2& coefs_size,


// - .
const Discrete<Real>& spec,
//
const Domain<Real, 2>& sdom,
//
const Discrete<Real>& white_noise,
//
const size3& zsize,
//
const Real3& zdelta,
//
Discrete<Real>& surface,
//
uint part_count)
// ()
const Index<2> id(coefs_size);
const Index<3> idz(zsize);
const Real3 delta = zdelta;
const Real2 d = sdom.delta();
const Real2 min = sdom.min();
uint count = zsize;
#pragma omp parallel for collapse(3) shared(white_noise) num_threads(part_count)
for (uint n=0; n<zsize[2]; n++) {
for (uint l=0; l<zsize[0]; l++) {
for (uint m=0; m<zsize[1]; m++) {
Real x = l*delta[0];
Real y = m*delta[1];
Real t = n*delta[2];
Real sum = 0;
for (uint i=0; i<coefs_size[0]; i++) {
Real w = min[0] + i*d[0];
for (uint j=0; j<coefs_size[1]; j++) {
Real theta = min[1] + j*d[1];
Real si, co;

sincosf(theta, &si, &co);


Real kx = w*w*co/Real(9.8154);
Real ky = w*w*si/Real(9.8154);
uint idx = id(i, j);
sum += coefs[idx]*cosf(kx*x + ky*y - w*t +
Real(2)*PI*white_noise[idx]);
}
}
surface[idz(l, m, n)] = sum;
}
}
}
}
9. MPI ( -).
void generate_water_surface(
const Discrete<Real>& coefs,
//
const size2& coefs_size,
// - .
const Discrete<Real>& spec_fd,
//
const Domain<Real, 2>& sdom,
//
const Discrete<Real>& white_noise,
//
const size3& zsize,
//
const Real3& delta,
//
Discrete<Real>& surface,
//
uint part_count)
// ()
{
const Index<2> id(coefs_size);
const Index<3> idz(zsize);
const Real2 d = sdom.delta();
const Real2 min = sdom.min();

uint part_size_x = zsize[0]/proc::count;


for (uint l=0; l<part_size_x; l++) {
for (uint m=0; m<zsize[1]; m++) {
for (uint n=0; n<zsize[2]; n++) {
Real x = l*delta[0];
Real y = m*delta[1];
Real t = n*delta[2];
Real sum = 0.0;
for (uint i=0; i<coefs_size[0]; i++) {
for (uint j=0; j<coefs_size[1]; j++) {
Real w = min[0] + i*d[0];
Real theta = min[1] + j*d[1];
Real kx = w*w*cos(theta)/9.8154;
Real ky = w*w*sin(theta)/9.8154;
sum += coefs[id(i, j)]*cos(kx*x + ky*y - w*t + 2.0*PI*white_noise[id(i,
j)]);
}
}
surface[idz(l, m, n)] = sum;
}
}
}
}