Академический Документы
Профессиональный Документы
Культура Документы
:
:
___ 2011 .
______________________( ..)
()
230100
MPI, OpenCL OpenMP
________________________ ( . . )
()
( ..)
__________________ ( . .)
()
( ..)
2011 .
1.
2.
2.1 -
2.2 -....
2.3 ..
2.4
3. -..............
4. -
11
5. ...........
12
6. ............
13
1. ..
2. .............................................................
1.
, . ,
20 300
. .
,
.
,
,
.
,
.
, ,
, .
.
, .
,
.
2.
, .
2.1 -
, - [5].
cn n, [0,
2]:
$$\zeta(x,y,t) = \sum_{n} c_n \cos\left(u_n x + v_n y - w_n t + \epsilon_n\right). \tag{1}$$
- , (x,y,t)
,
$$2\,S(u,v)\,du\,dv = \sum_{n} c_n^2 .$$
-
, .
- , (1)
. , -
,
, (1).
(
) -
.
2.2 -
-
OpenCL, OpenMP MPI,
.
- ,
[5]
$$S(\omega) = A\,\omega^{-k} \exp\left[-B\,\omega^{-n}\right] Q(\omega, \theta),$$
Q(, ) . A, B, k, n
,
1964 [5]
$$A = 0.28\,(2\pi)^4\,h^2\,\tau^{-4}, \quad B = 0.44\,(2\pi)^4\,\tau^{-4}, \quad k = 5, \quad n = 4,$$
h, t . ,
, cn
c n= 2
n+1
S ( ) d .
(1).
2.3
[6].
-
,
( ). ,
[1].
p1
p2
p3
(2)
i=0 j=0 k= 0
,
-. ,
,
A = b, ai,j = |x(i)-x(j)|,|y(i)-y(j)|,|t(i)-t(j)|, bi = x(i),y(i),t(i)
x(i) = mod((i+1)/(p1p2), p1) , y(i) = mod((i+1)/p3, p2) , t(i) = mod(i+1, p3) .
,
(0.1-0.7) (0.1-0.52) [6].
, .
z=f(y) F(z)
F(z) = (y),
(3)
(y) .
.
- [5, 9],
$$K_z(x,y,t) = \sum_{m=0}^{\infty} \frac{C_m^2}{m!}\,K_y^m(x,y,t), \qquad C_m = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^{\infty} f(y)\,H_m(y)\,\exp\left(-\frac{y^2}{2}\right) dy, \tag{4}$$
. -,
, [1, 6],
.
p .
, ,
i i .
.
(x, y, t) = e- (|x|+|y|+|t|)cos(x)cos(y)cos(t)
, ,
.
.
(z) = F(z) - 2T(z, ) ,
T(z, ) , F(z) .
=1.2 . 1.
(3)
, ,
,
().
$$z = f(y) \approx \sum_{i=0}^{N} d_i\,y^i .$$
12 500
[-5z, 5z].
, ,
.
;
$10^{-5}$ $0.43 \cdot 10^{-3}$.
(4),
[5, 9]
2z =
C 2m
.
m!
-. ,
, . -
,
.
( ).
(Mersenne Twister) $2^{19937}-1$.
, dcmt.
[7, 9].
. 2.
.
, .
,
. ,
.
( ) [2].
z x,y,t = i,j,k z xi,y j,z k + i,j,k z x+i,y+j,z+k + x,y,t .
i
3.
-
- :
OpenCL, OpenMP MPI,
, .
.
,
.
.
, .
(FPU), . ,
2 1
,
100 ( ) [8, 11].
OpenCL .
(2).
( ) ,
. ( ,
)
. 3. 2 1-2 OpenCL
.
. 3. () ()
() (b) .
OpenCL
.
() ,
[8]. ,
( 2 ).
,
.
. OpenCL
, OpenMP . MPI
;
, ,
. .
. 4. , ( ).
,
- ,
. ,
.
:
1) gcc (SUSE Linux) 4.3.2 [gcc-4_3-branch revision 141291] OpenMP,
MPI
2) llvm version 2.6svn (Built Jan 18 2011(17:37:10)) OpenCL.
3) NVIDIA version 260.19.36
4) MPICH version: 1.2.7 (release date 2005/06/22 16:33:49)
:
1) gnuplot 4.2 patchlevel 6
2) NVIDIA Compute Visual Profiler Version 3.2.0
4.
-
-
. : ,
, .
.
:
1) NVIDIA 8800GT, 14 :
8 CUDA (1,5 )
2
1 (FPU)
1
2) 10 :
1-2
. 6-8. 1
. , -
, ,
.
, (1),
. OpenCL -
,
.
1. OpenCL, OpenMP MPI -
( ).
-
OpenCL OpenMP MPI OpenCL OpenMP MPI
400000
0.82
40,44 32,60
1,80
0,080 0,075
440000
0.90
44,59 35,78
1,92
0,100 0,093
480000
0.99
48,49 38,93
2,29
0,097 0,126
520000
1.07
52,65 41,92
2,43
0,118
0,117
560000
1.15
56,45 45,00
2,51
0,117
0,161
600000
1.23
60,85 48,80
2,54
0,123 0,132
640000
1.31
65,07 53,02
2,73
0,123 0,160
680000
1.40
68,90 54,92
2,80
0,138 0,136
720000
1.48
72,49 58,42
2,88
0,144
760000
1.56
76,86 61,41
3,47
0,156 0,155
800000
1.64
81,03 66,42
3,25
0,166
0,173
0,174
OpenCL -
50 (. 5).
,
OpenMP MPI
.
. 5. (AR) - (LH)
.
. -
,
.
OpenCL OpenMP, .
. 6.
. 6. (
) (AR) - (LH)
.
()
OpenMP MPI. ,
, (. 7).
.
OpenCL ,
(occupancy), ,
[12]. 2
.
, - ,
.
2. OpenCL
(occupancy)
(generate_water_surface,
connect_wave_surface_parts3)
0,167
(generate_water_surface_lh)
0,5
(oclMersenneTwister, NVIDIA
SDK)
0,667
(oclTranspose,
NVIDIA SDK)
1,0
. 7. ( )
(AR) - (LH)
.
.
(
OpenMP ),
.
,
, , 2%
(. 8).
. 8. , ,
.
5.
,
,
. -
. ,
MPI,
.
.
- ,
-. OpenCL
.
.
. -
( ),
.
,
.
6.
1. ., . . .: , 1974
2. .., ..
. //. 2003, .-, 2003, .2, .64-68
3. . ., . ., . . . .:
, 1977
4. ., . . .: , 1999
5. . ., . ., . .
. .: , 1978
6. . ., . . . .:
, 1990
7. Makoto Matsumoto, Takuji Nishimura: Dynamic Creation of Pseudorandom Generators
8. OpenCL Programming Guide for the CUDA Architecture (2010)
9. Podlozhnyuk V. Parallel Mersenne Twister (2007)
10. Ruud van der Pas: OpenMP and performance. In: International Workshop on OpenMP, Dresden
(2009)
11. Intel 64 and IA-32 Architectures Optimization Reference Manual (2011)
12. Compute Visual Profiler User Guide (2010)
1.
,
: ,
, , , , .
. 7-8.
. 7. .
. 8. -.
2.
1.
1. OpenCL ( ).
// Listing: OpenCL kernel generate_wave_surface.
//
// What the visible body does: for every (x, y, t) with t in [t0, t1), where
// t0 = part*part_size and t1 = min(t0 + part_size - interval, zsize[2]), it
// accumulates products f[...]*Z[...] over the index ranges i < m1, j < m2,
// k < m3 (walked from the highest index down to 0) and adds the result to
// Z[idz(x, y, t)].
//
// NOTE(review): this listing appears to be damaged; the body does not match
// the signature and cannot be compiled as shown:
//   - the body reads and writes `Z`, while the parameter is named `z`;
//   - `Real sum` re-declares the name of the `sum` parameter;
//   - `f.size()` is called on `f`, which is declared as `local float*`;
//   - `id` and `idz` are not declared anywhere in this listing;
//   - `f_src`, `offset` and `p` are never used;
//   - there is one more closing brace than there are opening braces.
// Presumably the signature and the loop body come from two different
// listings that were merged during extraction - TODO confirm against the
// original sources.
kernel
void generate_wave_surface(
global float* f_src,
// original comment text lost; not referenced in the visible body
size3 fsize,
// original comment text lost; bounds the i, j, k ranges via m1, m2, m3
global float* z,
// original comment text lost; presumably the array the body calls `Z` - TODO confirm
size3 zsize,
// original comment text lost; bounds the x and y loops and clamps t1
int part_size,
// original comment text lost; multiplied by the group id to obtain t0
int interval,
// original comment text lost; subtracted from the end of the part when computing t1
local float* f,
// original comment text lost; indexed in the innermost loop
local float* sum,
// original comment text lost; shadowed by the local `Real sum` below
size3 offset)
{
const int part = get_group_id(0);
const int p = get_local_size(0);
// Range of t handled by this work-group.
uint t0 = part*part_size;
uint t1 = min(t0 + part_size - interval, zsize[2]);
for (uint x=0; x<zsize[0]; x++) {
for (uint y=0; y<zsize[1]; y++) {
for (uint t=t0; t<t1; t++) {
// Clamp the summation ranges so that x-i, y-j, t-k never go below zero.
int m1 = min(x+1u, fsize[0]);
int m2 = min(y+1u, fsize[1]);
int m3 = min(t+1u, fsize[2]);
Real sum = 0;
for (int i=m1-1; i>=0; --i)
for (int j=m2-1; j>=0; --j)
for (int k=m3-1; k>=0; --k)
sum += f[f.size()-1-id(i, j, k)]*Z[idz(x-i, y-j, t-k)];
Z[idz(x, y, t)] += sum;
}
}
}
}
// NOTE(review): the brace below has no matching opening brace in this listing.
}
4. OpenMP ( ).
// Listing: connect_water_surface_parts (OpenMP variant).
//
// NOTE(review): this listing is incomplete. Text is missing between the
// `Index<3> id(fsize);` declaration and the fragment `skip)];`, so the
// declarations of x, y, t, m1, m2, sum, skip and idf, as well as the loops
// that enclose the visible statements, cannot be seen here. The braces are
// unbalanced for the same reason. Do not compile or edit the logic from this
// copy; recover the full text first.
//
// What the visible tail does for one (x, y, t):
//   1. adds f[idf(i, j, k + skip)] * z[id(x+i, y+j, t+k + skip)] to `sum`
//      for i < m1, j < m2, k < fsize[2]-skip;
//   2. re-clamps m1, m2, m3 to min(coordinate+1, fsize[axis]) and adds
//      f[idf(i, j, k)] * z[id(x-i, y-j, t-k)] to `sum`;
//   3. adds `sum` to z[id(x, y, t)].
void connect_water_surface_parts(
const Discrete<Real>& f,
// original comment text lost; array read through idf(...) below
const size3& fsize,
// original comment text lost; bounds the i, j, k ranges
Discrete<Real>& z,
// original comment text lost; array that is read and updated in place
const size3& zsize,
// original comment text lost; not referenced in the visible part
uint part_size,
// original comment text lost; not referenced in the visible part
uint interval)
// original comment text lost; not referenced in the visible part
{
const Index<3> id(fsize);
// NOTE(review): source text is missing here; the next line is the tail of a lost statement.
skip)];
}
int m3 = fsize[2]-skip;
// First pass: terms taken at x+i, y+j, t+k+skip.
for (int i=0; i<m1; i++)
for (int j=0; j<m2; j++)
for (int k=0; k<m3; k++)
sum += f[idf(i, j, k + skip)]*z[id(x+i, y+j, t+k + skip)];
// Second pass: terms taken at x-i, y-j, t-k, with ranges clamped so the
// indices never go below zero.
m1 = min(x+1, fsize[0]);
m2 = min(y+1, fsize[1]);
m3 = min(t+1, fsize[2]);
for (int i=m1-1; i>=0; --i)
for (int j=m2-1; j>=0; --j)
for (int k=m3-1; k>=0; --k)
sum += f[idf(i, j, k)]*z[id(x-i, y-j, t-k)];
z[id(x, y, t)] += sum;
}
}
}
}
2. -
7. OpenCL ( -).
// Computes one value of `surface` per work-group.
//
// The three group ids select the output element and, multiplied by zdelta,
// give the coordinates x, y, t. The work-items of the group share the
// COEFS_SIZE_X*COEFS_SIZE_Y coefficients between them: work-item `lid`
// handles the flat indices lid, lid + lsize, lid + 2*lsize, ... and keeps its
// partial sum in reduc_buf[lid]. After the barrier, work-item 0 adds the
// partial sums in index order and stores the total.
kernel
void generate_water_surface_lh(
    global Real* coefs,          // coefficient per flat index c
    Size2 coefs_size,            // not read here; the bounds come from COEFS_SIZE_X / COEFS_SIZE_Y
    Domain2 sdom,                // provides min.x, min.y and the steps returned by delta2()
    size3 zsize,                 // forwarded to index() when storing the result
    Real3 zdelta,                // per-axis step multiplied by the group ids
    global float* white_noise,   // per-coefficient value, scaled by 2*PI inside the cosine
    global float* surface,       // output array
    local float* reduc_buf)      // one partial sum per work-item of the group
{
    const uint lid   = get_local_id(0);
    const uint lsize = get_local_size(0);
    const uint gx = get_group_id(0);
    const uint gy = get_group_id(1);
    const uint gt = get_group_id(2);

    const Real2 step = delta2(sdom);
    const Real x = gx*zdelta.x;
    const Real y = gy*zdelta.y;
    const Real t = gt*zdelta.z;

    // Partial sum of this work-item over its share of the coefficients.
    reduc_buf[lid] = 0.0;
    uint c = lid;
    while (c < COEFS_SIZE_X*COEFS_SIZE_Y) {
        // The flat index c is split into a row (c / COEFS_SIZE_Y) and a
        // column (c % COEFS_SIZE_Y); each is mapped onto the sdom axes.
        const Real w = mad(step.x, ((c/COEFS_SIZE_Y) % (COEFS_SIZE_X - 0)), sdom.min.x);
        const Real teta = mad(step.y, (c % (COEFS_SIZE_Y - 0)), sdom.min.y);
        const Real proj = cos(teta)*x + sin(teta)*y;
        // NOTE(review): 9.8154 is presumably the gravitational acceleration - confirm.
        reduc_buf[lid] += coefs[c]*cos(w*(w*native_divide(proj,
            9.8154) - t) + 2.0*PI*white_noise[c]);
        c += lsize;
    }

    // All partial sums must be visible before work-item 0 reads them.
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid == 0) {
        Real total = 0.0;
        for (uint i = 0; i < lsize; ++i) {
            total += reduc_buf[i];
        }
        surface[index(gx, gy, gt, zsize)] = total;
    }
}
8. OpenMP ( -).
void generate_water_surface(
const Discrete<Real>& coefs,
//