Contents
CPU architectures: Intel Nehalem, AMD Istanbul, IBM Cell
Cluster systems
GPU architectures from Nvidia: G80, G200, Fermi
The CUDA programming model, the CUDA API and memory types
Texture memory and cudaArray
Modern processors improve their performance mainly by executing more and more operations in parallel, and for many computational problems a single processor is no longer enough. In 2006 Nvidia introduced CUDA, a hardware and software architecture for general-purpose parallel computing on Nvidia graphics processors. CUDA makes the large number of simple cores of a GPU available to ordinary programs written in an extension of the C language. This text describes the hardware on which CUDA programs run, the CUDA programming model and API, the different kinds of device memory and the optimizations connected with them. We begin with a short overview of the underlying processor architectures.
Processor architectures are usually grouped according to Flynn's classification (Table 1).

Table 1. Flynn's classification of architectures.
                        Single Data    Multiple Data
Single Instruction      SISD           SIMD
Multiple Instruction    MISD           MIMD

Below we look at several representative processors: the multi-core CPUs of Intel and AMD and the IBM Cell processor.
Intel Nehalem
Figure 1. The Intel Nehalem processor.
As Figure 1 shows, an Intel Nehalem processor contains four identical cores, each a full-featured general-purpose core with its own caches, and a shared last-level cache. The QPI (QuickPath Interconnect) bus links the processor to the chipset and to other processors in multi-socket systems. The IMC (Integrated Memory Controller) is placed on the processor die, so the cores access memory directly rather than through the chipset.
AMD Istanbul
Figure 2. The AMD Istanbul processor.
As Figure 2 shows, the AMD Istanbul processor contains six cores, and its overall organization is similar to the Intel one: the memory controller and the inter-processor links are integrated on the die, and the cores share a common last-level cache.
Thus modern Intel and AMD processors are MIMD machines assembled from a small number of powerful, independent SISD cores, each of which can run its own program. Table 2 summarizes the two processors described above.

Table 2. Comparison of the Intel and AMD processors.

Such processors are convenient for general-purpose workloads, but the number of cores, and therefore the degree of parallelism, remains small.
The IBM Cell processor (Figure 3) takes a different approach. Alongside a conventional control core it contains a set of SPE (Synergistic Processing Elements), simple cores designed purely for computation. The SPEs operate on data in SIMD fashion, so Cell achieves high arithmetic throughput, but programming it requires explicitly distributing the work and the data between the SPEs.
Figure 3. The IBM Cell processor.
2. Cluster systems
When the performance of a single machine is not enough, computers are combined into clusters: sets of nodes connected by a network and working on one task. Figures 4, 5 and 6 show typical configurations. The head node accepts jobs and provides the common services for the compute nodes (for example, it acts as the DHCP server), while the remaining nodes perform the computation. In the simplest clusters the nodes are connected by ordinary 1 Gbit/s Ethernet; clusters built for communication-heavy problems add a dedicated low-latency interconnect such as 10 Gbit/s Myrinet or 40 Gbit/s Infiniband. Besides homogeneous clusters there are hybrid ones, in which each node is equipped with an accelerator (a Cell processor or a graphics card); it is the GPU kind of accelerator that the rest of this text is devoted to.
Before describing the GPU hardware, we introduce the basic terms used below.
Host -- the CPU and its memory; the host controls the course of the computation.
Device -- the GPU and its memory, used as a massively parallel coprocessor (accelerator).
Kernel -- a function that is executed on the device in parallel by many threads.
Grid -- the set of all blocks of threads that execute a given kernel.
Block -- a group of threads executed on one SM; the threads of a block can cooperate through shared memory.
Thread -- a single execution of the kernel; the elementary unit of parallelism.
Warp -- a group of 32 threads that the hardware executes physically at the same time.
GPU architectures from Nvidia
Since CUDA runs on Nvidia hardware, we consider the architecture of Nvidia GPUs. The general structure of such a GPU is shown in Figure 7.
Figure 7. General structure of an Nvidia GPU.
The GPU is built out of several TPC (Texture Process Cluster) units: the G80 architecture has 8 of them, G200 has 10, and the number in a particular card depends on its class (the GeForce 220GT has 2, the GeForce 275 has 10). Besides the TPCs, the card carries the memory controllers and the DRAM, the global memory of the device. The DRAM is accessible from all TPCs, and it is also the memory into which the CPU copies the input data and from which it reads back the results.
Figure 8. Structure of a TPC in the G80 architecture.
A TPC consists of:
TEX -- the texture units, which fetch data from memory through the texture cache;
SM -- streaming multiprocessors, the units that actually execute the threads. In G80 each TPC contains 2 SMs.
An SM, in turn, contains:
SP -- scalar processors, which perform the ordinary arithmetic operations (there are 8 of them per SM);
SFU -- special function units, which evaluate transcendental functions (exp, sqrt, log and similar); there are fewer of them than SPs.
The threads assigned to an SM are executed in groups of 32 called warps; the SM switches between the warps that are ready to run, which hides the latency of memory accesses. Each SM also has 16 KB of shared memory and a register file; the shared memory is divided among the blocks resident on the SM, and the registers among their threads.
Figure 9. Structure of a TPC in the G200 architecture.
The G200 architecture differs from G80 in several ways:
- each TPC contains 3 SMs instead of 2;
- the total number of SPs grows to 240;
- double-precision arithmetic appears, but there is only one double-precision unit per 8 SPs, so double computations are roughly 8 times slower than float ones;
- the per-SM resources, in particular the register file, are enlarged.
Fermi
Figure 10. The Fermi architecture.
The key features of the Fermi architecture are:
Third-generation streaming multiprocessor (SM):
- 32 CUDA cores per SM, four times more than in GT200;
- greatly increased double-precision floating-point performance;
- two warp schedulers per SM, issuing instructions from two warps at once;
- 64 KB of on-chip memory per SM, configurable between shared memory and L1 cache.
Second generation of the parallel thread execution ISA (Parallel Thread Execution, PTX 2.0):
- full support for C++;
- optimized for OpenCL and DirectCompute;
- full 32-bit and 64-bit floating point according to IEEE 754-2008;
- 64-bit unified addressing;
- better performance through predication.
Improved memory subsystem:
- the NVIDIA Parallel DataCache hierarchy with configurable L1 and unified L2 caches;
- ECC support, the first GPU with error-correcting memory;
- faster atomic operations.
The NVIDIA GigaThread engine:
- much faster context switching between applications;
- concurrent execution of several kernels;
- overlapping of data transfers with computation.
Table 7 compares the three generations of Nvidia GPUs.

Table 7. Nvidia GPU generations.
Parameter                           G80      GT200    Fermi
Year of release                     2006     2008     2009
Number of transistors, millions     681      1400     3000
CUDA cores (SP)                     128      240      512
Shared memory per SM, KB            16       16       48 or 16 (configurable)
L1 cache per SM, KB                 -        -        16 or 48 (configurable)
L2 cache, KB                        -        -        768
ECC memory                          no       no       yes
Different GPU generations differ not only in speed but also in their capabilities: double precision, atomic operations, caches and so on. To describe these capabilities Nvidia introduced the notion of Compute Capability, a version number supported by a particular device. The existing versions are 1.0, 1.1, 1.2, 1.3 and 2.0; a device with a higher Compute Capability supports everything available in the lower versions. Table 8 gives examples.
Table 8. Compute Capability of several GPUs.
GPU                  Compute Capability
GeForce 8800GTX      1.0
GeForce 9800GTX      1.1
GeForce 210          1.2
GeForce 275GTX       1.3
Tesla C2050          2.0
The CUDA programming model
CUDA describes the computation in terms of the following notions:
Thread -- a single execution of the kernel; the elementary unit of parallelism.
Warp -- a group of 32 threads executed physically at the same time.
Block -- a group of threads executed on one SM; the threads of a block can communicate through shared memory and synchronize with each other.
Grid -- the set of all blocks executing a given kernel.
Kernel -- a function executed on the device in parallel by all the threads of the grid.
Device -- the GPU, used as a coprocessor (accelerator).
Host -- the CPU, which controls the device: it prepares the data, launches the kernels and collects the results.
Threads are grouped into blocks and blocks into a grid precisely because the threads of one block can cooperate and synchronize, while different blocks are executed independently of each other; this independence lets the same program run on GPUs with different numbers of SMs.
The resulting hierarchy of threads is shown in Figure 11.
Figure 11. The hierarchy of threads: grid, blocks, threads.
As Figure 11 shows, the warp size is fixed at 32 threads, while the sizes of a block and of the grid are chosen by the programmer; both the block and the grid can be one- or two-dimensional (a block can also be three-dimensional). The hardware limits on these sizes are given in Table 9.

Table 9. Limits on grid and block sizes.
Maximum grid dimensions              65536 x 65536
Maximum block dimensions             512 x 512 x 64
Maximum number of blocks in a grid   4294967296
Maximum number of threads per block  512
The configuration is chosen so that there is enough work for the whole GPU; usually one thread is created per element of the data. At run time the blocks of the grid are distributed among the SMs: for example, a card with 10 TPCs and 2 SMs per TPC has 20 SMs to fill. One SM can hold up to 8 blocks at a time; the remaining blocks are started as earlier ones finish, which is why blocks must not depend on one another.
To build CUDA programs Nvidia supplies the NVCC compiler. Source files that contain CUDA code use the *.cu extension; NVCC separates such a file into host code, which is handed to the ordinary C compiler of the system, and device code, which NVCC compiles itself. For the programmer CUDA appears as a set of extensions to the C language: qualifiers for functions and variables, the kernel launch syntax and built-in variables. The function qualifiers are listed in Table 10.
Table 10. Function qualifiers.
Qualifier      Executed on    Callable from
__device__     device         device
__global__     device         host
__host__       host           host
The __global__ qualifier marks a kernel, a function that runs on the device and is launched from the host; this is the only way to start execution on the GPU. A __global__ function must return void, and __global__ cannot be combined with __host__. The __host__ and __device__ qualifiers can be combined: the function is then compiled twice, once for the CPU and once for the GPU. A function without any qualifier is treated as __host__.
Functions executed on the device are subject to several restrictions:
- recursion is not supported (it appears only on Fermi);
- the address of a function cannot be taken (except for __global__ functions);
- a variable number of arguments is not allowed;
- static variables cannot be declared inside the function.
A short sketch of how the qualifiers are used is given below.
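A minimal sketch; the functions scale and add2 are invented for the illustration:

// compiled for both CPU and GPU
__host__ __device__ float scale(float x)
{
    return 2.0f * x;
}

// callable only from device code
__device__ float add2(float x)
{
    return x + 2.0f;
}

// a kernel: runs on the device, launched from the host
__global__ void kernel(float *data)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    data[idx] = add2(scale(data[idx]));
}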
Device code is executed simultaneously by thousands of threads, so for every variable it matters in which kind of memory it lives and which threads can see it. For this CUDA introduces variable qualifiers (Table 11).

Table 11. Variable qualifiers.
Qualifier       Located in        Accessible from
__device__      global memory     all threads of the device
__constant__    constant memory   device (read) / host (read and write)
__shared__      shared memory     threads of one block
Access to __shared__ data by the threads of a block is coordinated with __syncthreads().
A few additional rules apply. A __device__ variable lives in global memory for the whole run of the program. A __shared__ variable cannot be initialized at its declaration. A __constant__ variable is read-only for the device and is assigned values only from the CPU, through dedicated API functions. None of these qualifiers can be applied to the fields of a struct or union.
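A small sketch of the three qualifiers together (the names and array sizes are arbitrary; the kernel is assumed to be launched with blocks of 256 threads):

__device__   float g_table[256];    // global memory, visible to all threads
__constant__ float c_coeff[16];     // constant memory, filled from the host

__global__ void smooth(float *data)
{
    __shared__ float tile[256];     // shared memory, one copy per block

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = data[idx] * c_coeff[0] + g_table[threadIdx.x];
    __syncthreads();                // make the tile visible to the whole block
    data[idx] = tile[threadIdx.x];
}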
Inside device code CUDA provides built-in variables through which a thread finds its place in the grid:
dim3 gridDim -- the dimensions of the grid;
uint3 blockIdx -- the index of the current block within the grid;
dim3 blockDim -- the dimensions of a block;
uint3 threadIdx -- the index of the current thread within its block;
int warpSize -- the size of a warp (equal to 32).
From these values every thread can compute the global index of the data element it has to process.
A kernel is declared as a function with the __global__ qualifier and an ordinary parameter list params:
__global__ void Kernel_name(params);
It is launched with an execution configuration:
Kernel_name<<<grid, block, mem, stream>>> ( params ),
where
dim3 grid -- the size of the grid, i.e. the number of blocks in each dimension;
dim3 block -- the size of a block, i.e. the number of threads in each dimension;
size_t mem -- the amount of additional shared memory to allocate for each block (optional, 0 by default);
cudaStream_t stream -- the stream in which to launch the kernel (optional).
Example:

#define BS 256  // number of threads in a block
#define N  1024 // number of elements to process

// kernel executed on the device
__global__ void kernel(int* data){
    // global index of the element handled by this thread
    int idx = blockIdx.x * BS + threadIdx.x;
    some code
}

int main(){
    int* data;
    // one thread per element: N / BS blocks of BS threads each
    dim3 block = dim3(BS);
    dim3 grid = dim3(N / BS);
    some code
    // launch the kernel with the chosen configuration
    kernel<<<grid, block>>>(data);
    return 0;
}
On the host side all interaction with the device, such as memory allocation, data transfer and kernel launches, goes through the CUDA API. There are two APIs: the low-level driver API and the runtime API. The runtime API is implemented on top of the driver API; it is simpler to use and its capabilities are sufficient for most programs, so it is used throughout this text. Driver API functions carry the prefix cu, runtime API functions the prefix cuda. Almost every API function returns a value of type cudaError_t; the value cudaSuccess means that no error occurred. Keep in mind that kernel launches are asynchronous: control returns to the host before the kernel finishes, so an error caused by a kernel may be reported only by a later API call, and the host must synchronize with the device before using the results.
The runtime API functions used most often are:
char* cudaGetErrorString(cudaError_t) -- returns a text description of an error code.
cudaError_t cudaGetLastError() -- returns the last error registered in the calling host thread and resets the error status.
cudaError_t cudaThreadSynchronize() -- waits until the device finishes all previously requested work.
cudaError_t cudaGetDeviceCount(int *) -- returns the number of devices that support CUDA.
cudaError_t cudaGetDeviceProperties(cudaDeviceProp * props, int deviceNo) -- fills props with the properties of device deviceNo; in particular, the Compute Capability is reported in the major and minor fields of cudaDeviceProp.
Since runtime API calls can also return errors produced by earlier asynchronous operations, it is good practice to check the error status after every step of the program.
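A minimal sketch that uses these calls to list the available devices and their Compute Capability (the output format is arbitrary):

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    printf("CUDA devices: %d\n", count);
    for (int i = 0; i < count; i++)
    {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, i);
        printf("%d: %s, Compute Capability %d.%d\n",
               i, props.name, props.major, props.minor);
    }
    return 0;
}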
Working with global memory in CUDA has its own peculiarities. The device reads and writes global memory in 32-, 64- or 128-bit words, and an access such as t[i] is compiled into a single instruction only if the size of the type is 4, 8 or 16 bytes and the address is aligned to that size. The built-in types satisfy this requirement; for user-defined structures the alignment can be requested with the __align__(size) specifier. Memory allocated with cudaMalloc is always aligned to at least 256 bytes.
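A sketch of an aligned user-defined structure (the Particle type is invented for the example):

struct __align__(16) Particle
{
    float x, y, z;   // position
    float w;         // mass
};

__global__ void shift(Particle *p)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    Particle t = p[idx];   // the 16-byte structure is read as one 128-bit word
    t.x += 1.0f;
    p[idx] = t;
}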
Overall, a CUDA device offers several kinds of memory, summarized in Table 12.

Table 12. CUDA memory types.
Memory      Access   Scope        Location
Registers   R/W      per-thread   on-chip
Local       R/W      per-thread   DRAM
Shared      R/W      per-block    on-chip
Global      R/W      per-grid     DRAM
Constant    R/O      per-grid     DRAM, cached
Texture     R/O      per-grid     DRAM, cached
Registers are the fastest memory of the device; each SM has its own register file, which is divided among the threads resident on it, so the more registers a thread uses, the fewer threads the SM can keep active. When the registers are not enough, the excess data spills into local memory, which is private to the thread but physically lives in DRAM and is therefore slow. Shared memory, 16 KB per SM (up to 48 KB on Fermi), is shared by the threads of a block; it is on-chip and almost as fast as registers. Global memory is the largest memory of the device (up to 6 GB on a Tesla C2070); it is located in DRAM, so a single access costs hundreds of cycles, although the overall bandwidth of the DRAM is high. Constant and texture memory also reside in DRAM, but they are read-only for kernels and are cached, so repeated reads of the same data are cheap.
All work with global memory is controlled from the host. A typical CUDA program allocates buffers in device memory, copies the input data into them, launches one or more kernels that process the data, copies the results back to host memory and frees the device buffers. The main functions used for this are the following:
cudaError_t cudaMalloc ( void ** devPtr, size_t size ); -- allocates size bytes of linear device memory and returns the pointer to it in devPtr.
cudaError_t cudaMallocPitch ( void ** devPtr, size_t * pitch, size_t width, size_t height ); -- allocates memory for a 2D array of width * height bytes, padding each row for alignment; the pointer is returned in devPtr and the actual row size in bytes in pitch.
cudaError_t cudaFree ( void * devPtr ); -- frees the device memory pointed to by devPtr.
cudaError_t cudaMemcpy ( void * dst, const void * src, size_t count, enum cudaMemcpyKind kind ); -- copies count bytes from src to dst, where
dst -- the destination pointer;
src -- the source pointer;
count -- the number of bytes to copy;
kind -- the direction of the copy:
cudaMemcpyHostToDevice -- from the host to the device;
cudaMemcpyDeviceToHost -- from the device to the host;
cudaMemcpyDeviceToDevice -- within device memory;
cudaMemcpyHostToHost -- within host memory.
cudaError_t cudaMemcpyAsync ( void * dst, const void * src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ); -- the same as cudaMemcpy, but asynchronous: the call returns immediately and the copy is performed in the stream stream.
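A minimal host-side sketch of the allocate / copy / launch / copy back / free pattern (the buffer size is arbitrary, and the kernel launches are omitted):

#include <cuda_runtime.h>
#define N 1024

int main()
{
    float h_data[N];              // buffer in host memory
    float *d_data = NULL;         // buffer in device (global) memory

    cudaMalloc((void **)&d_data, N * sizeof(float));
    cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice);
    // ... kernel launches that work on d_data ...
    cudaMemcpy(h_data, d_data, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_data);
    return 0;
}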
As a complete example, consider computing pi. The value is obtained by integrating y = sqrt(1 - x*x) over [0, 1] with the trapezoid rule and multiplying the result by 4; each thread of the grid computes the area of one trapezoid (Listing 13).

Listing 13. Computing pi on the GPU.

#include <stdio.h>
#include <time.h>
#define CUDA_FLOAT float
#define GRID_SIZE 256
#define BLOCK_SIZE 256
#define DEVICE 0 // number of the GPU to use
// check the result of the last CUDA API call
void check_cuda_error(const char *message)
{
cudaError_t err = cudaGetLastError();
if(err!=cudaSuccess)
printf("ERROR: %s: %s\n", message, cudaGetErrorString(err));
}
// kernel: each thread computes the area of one trapezoid under y = sqrt(1 - x*x)
__global__ void pi_kern(CUDA_FLOAT *res)
{
int n = blockIdx.x * BLOCK_SIZE + threadIdx.x; // number of the trapezoid
CUDA_FLOAT dx = 1.f / (GRID_SIZE * BLOCK_SIZE); // width of one trapezoid
CUDA_FLOAT x0 = n * dx;
CUDA_FLOAT y0 = sqrtf(1 - x0 * x0);
CUDA_FLOAT s = 0; // area computed by this thread
CUDA_FLOAT x1, y1;
x1 = x0 + dx;
y1 = sqrtf(1 - x1 * x1);
s = (y0 + y1) * dx / 2.f;
// store the partial result in global memory
res[n] = s;
}
int main(int argc, char** argv)
{
cudaSetDevice(DEVICE); // select the GPU
check_cuda_error("Error selecting device");
CUDA_FLOAT *res_d; // results on the GPU
CUDA_FLOAT res[GRID_SIZE * BLOCK_SIZE]; // results on the CPU
// allocate memory on the GPU
cudaMalloc((void**)&res_d, sizeof(CUDA_FLOAT) * GRID_SIZE * BLOCK_SIZE);
check_cuda_error("Allocating memory on GPU");
// launch the kernel
dim3 grid(GRID_SIZE);
dim3 block(BLOCK_SIZE);
pi_kern<<<grid, block>>>(res_d);
// wait for the kernel to finish
cudaThreadSynchronize();
check_cuda_error("Executing kernel");
// copy the results back to the CPU
cudaMemcpy(res, res_d, sizeof(CUDA_FLOAT) * GRID_SIZE * BLOCK_SIZE,
cudaMemcpyDeviceToHost);
check_cuda_error("Copying results from GPU");
// free GPU memory
cudaFree(res_d);
check_cuda_error("Freeing device memory");
// sum the partial results; the integral gives pi / 4
CUDA_FLOAT pi = 0;
for (int i = 0; i < GRID_SIZE * BLOCK_SIZE; i++)
{
pi += res[i];
}
pi *= 4;
printf("PI = %.12f\n", pi);
return 0;
}
The CUDA_FLOAT macro makes it easy to switch the whole computation between float and double and compare the accuracy of the two versions. The partial sums are added on the CPU; each thread does very little arithmetic, so the example mainly illustrates the overall structure of a CUDA program.
Coalescing (merging of memory requests)
The most important optimization when working with global memory is coalescing. The memory controller can merge the global memory accesses issued by the threads of one half-warp (16 threads) into a single transaction of 32, 64 or 128 bytes (Figure 14), which reduces the number of memory operations many times.
Figure 14. Merging the accesses of a half-warp into one transaction.
Whether the accesses are merged depends on the Compute Capability of the device (Table 13).
Table 13. Conditions for coalescing.
Compute Capability 1.0, 1.1:
- the threads access 32-bit words (one 64-byte transaction) or 64-bit words (one 128-byte transaction);
- all the requested words lie in one aligned segment;
- the k-th thread of the half-warp accesses the k-th word of the segment.
Compute Capability >= 1.2: the accesses are merged whenever all the requested words lie in one aligned segment of
- 32 bytes for 8-bit words;
- 64 bytes for 16-bit words;
- 128 bytes for 32- and 64-bit words;
the order of the threads within the segment does not matter.
Figure 15. Access patterns for which coalescing occurs.
Figure 16. Access patterns for which coalescing does not occur on Compute Capability 1.0 and 1.1.
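The difference can be illustrated by two small copy kernels (a sketch, not taken from the listings above): in copy_good the k-th thread of a half-warp reads the k-th word of an aligned segment and the 16 reads are merged into one transaction, while in copy_bad the stride scatters the reads over many segments.

__global__ void copy_good(float *dst, const float *src)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx];            // consecutive threads read consecutive words
}

#define STRIDE 16

__global__ void copy_bad(float *dst, const float *src)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx * STRIDE];   // neighbouring threads hit different segments
}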
Shared memory, as already mentioned, amounts to 16 KB per SM and is shared by the threads of a block. It is divided into 16 banks, with successive 32-bit words falling into successive banks. If the 16 threads of a half-warp access words in different banks, all accesses are served in one step; if several threads hit the same bank, a bank conflict occurs and those accesses are serialized. Shared memory is declared with the __shared__ qualifier, and the threads of a block synchronize with each other by calling __syncthreads().
Example:
#define BS 256 // block size
__global__ void kern(float* data){
// copy a block of float elements from global
// memory into shared memory and combine neighbours:
__shared__ float a[BS];
int idx = blockIdx.x * BS +
threadIdx.x;
// load from global to shared memory
a[threadIdx.x] = data[idx];
// wait until the whole block has finished loading
__syncthreads();
// each thread uses its element and its neighbour's
data[idx] = a[threadIdx.x] + a[(threadIdx.x +
1) % BS];
}
Listing 17. Shared memory with float elements.
In Listing 17 the array elements are 32-bit float values, so consecutive threads of a half-warp access consecutive banks and there are no bank conflicts. Now replace float with short (Listing 18).
#define BS 256 // block size
__global__ void kern(short* data){
// the same code, but with short
// elements instead of float:
__shared__ short a[BS];
int idx = blockIdx.x * BS +
threadIdx.x;
// load from global to shared memory
a[threadIdx.x] = data[idx];
// wait until the whole block has finished loading
__syncthreads();
data[idx] = a[threadIdx.x] + a[(threadIdx.x
+ 1) % BS];
}
Listing 18. Shared memory with short elements.
A short element occupies 16 bits, so two consecutive short values fall into the same bank and the threads of a half-warp generate two-way bank conflicts. With char elements (8 bits) four values share a bank and the conflicts become four-way. Bank conflicts are resolved by the hardware, but they slow shared memory access down proportionally, so the layout of the data in shared memory should be chosen to avoid them.
Example: matrix multiplication.
As a larger example, consider the multiplication of two matrices A and B stored in global memory. The following notation is used:
WA, WB -- the widths of the matrices A and B;
BS -- the block size (BLOCK_SIZE);
BX = blockIdx.x;
BY = blockIdx.y;
TX = threadIdx.x;
TY = threadIdx.y.
The grid is two-dimensional, and each thread computes one element of the result matrix C as the dot product of a row of A and a column of B (Listing 19). Note that in this first version every element of A and B is read from global memory over and over again by different threads!
Listing 19. Matrix multiplication, naive version.
#define BLOCK_SIZE 16

// each thread computes one element of C = A * B
__global__ void matMult ( float * a, float * b, float * c, int wa, int wb )
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
float sum = 0.0f;
int ia = wa * BLOCK_SIZE * by + wa * ty;          // start of the row of A
int ib = BLOCK_SIZE * bx + tx;                    // start of the column of B
int ic = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;  // offset of the block of C
// dot product of the row of A and the column of B
for ( int k = 0; k < wa; k++ )
sum += a [ia + k] * b [ib + k * wb];
c [ic + wb * ty + tx] = sum;
}
Let us estimate the cost of this kernel: to compute one element of C a thread performs 2 * WA arithmetic operations and 2 * WA reads from global memory, so the kernel is limited by memory rather than by arithmetic. The profiler confirms this (Figure 20).
Figure 20. Profiler results for the naive kernel.
About 85% of the running time is spent on global memory accesses.
To reduce the number of global memory accesses, split the matrices into square sub-matrices of size BLOCK_SIZE x BLOCK_SIZE. The block of C computed by one thread block is then the sum
C' = A1 * B1 + A2 * B2 + ...,
where Ai and Bi are sub-matrices of A and B. Each pair of sub-matrices is first loaded into shared memory, and every loaded element is then reused by all the threads of the block!
As a result, with BLOCK_SIZE = 16 each thread performs only 2 * WA / 16 = WA / 8 reads from global memory instead of 2 * WA.
Listing 21. Matrix multiplication using shared memory.
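A sketch of such a kernel, assuming wa and wb are multiples of BLOCK_SIZE (variable names follow Listing 19):

__global__ void matMultShared ( float * a, float * b, float * c, int wa, int wb )
{
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int aBegin = wa * BLOCK_SIZE * by;    // first sub-matrix of A for this block
    int bBegin = BLOCK_SIZE * bx;         // first sub-matrix of B for this block
    float sum = 0.0f;

    for ( int ia = aBegin, ib = bBegin; ia < aBegin + wa;
          ia += BLOCK_SIZE, ib += BLOCK_SIZE * wb )
    {
        __shared__ float as[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float bs[BLOCK_SIZE][BLOCK_SIZE];

        as[ty][tx] = a[ia + wa * ty + tx];   // load the sub-matrices once
        bs[ty][tx] = b[ib + wb * ty + tx];
        __syncthreads();

        for ( int k = 0; k < BLOCK_SIZE; k++ )   // multiply the sub-matrices
            sum += as[ty][k] * bs[k][tx];
        __syncthreads();
    }

    int ic = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    c[ic + wb * ty + tx] = sum;
}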
The profiler results for this version are shown in Figure 22.
Figure 22. Profiler results for the shared-memory version.
Global memory accesses now take about 13% of the running time instead of 85%, and about 81% of the time goes to useful computation (compare with Figure 20).
Constant memory
Constant memory is read-only for the device and is cached, so repeated reads of the same values by many threads are cheap; it is filled from the host side. A constant array is declared at file scope:
__constant__ float constData [256]; -- an array of 256 float values in constant memory, visible to all threads.
It is filled from the host with
cudaMemcpyToSymbol ( constData, hostData, sizeof ( hostData ), 0, cudaMemcpyHostToDevice ); -- copies the contents of the host array hostData into the constant array constData.
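A short sketch of the whole pattern (the kernel and the setup function are invented for the illustration):

__constant__ float constData[256];

__global__ void scaleAll(float *data)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    data[idx] *= constData[idx % 256];   // cached read-only access
}

void setup(const float *hostData)        // hostData holds 256 values
{
    cudaMemcpyToSymbol(constData, hostData, 256 * sizeof(float), 0,
                       cudaMemcpyHostToDevice);
}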
Texture memory
Texture memory comes from the graphics pipeline. For kernels it is read-only, and every fetch goes through a dedicated texture cache, so it is useful when threads read data in patterns that cannot be coalesced but have good locality. In addition, texture fetches can process the data on the fly: out-of-range coordinates are handled automatically and the hardware can interpolate between neighbouring elements. These possibilities are described below.
A texture is accessed through a texture reference declared at file scope:
texture< type , dim, tex_type> g_TexRef;
type -- the type of the stored elements;
dim -- the dimensionality of the texture (1, 2 or 3);
tex_type -- the type returned by a fetch:
o cudaReadModeNormalizedFloat -- the value is converted to a normalized float;
o cudaReadModeElementType -- the value is returned exactly as it is stored.
The format of a texture element (texel) is described by the structure cudaChannelFormatDesc:
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
int x, y, z, w; -- the number of bits in each component, in the range [0, 32];
cudaChannelFormatKind -- the kind of the components:
o cudaChannelFormatKindSigned -- signed int;
o cudaChannelFormatKindUnsigned -- unsigned
int;
o cudaChannelFormatKindFloat -- float.
A CUDA texture can be bound either to ordinary linear memory or to a cudaArray (Table 14).
Table 14. Texture capabilities in CUDA.
cudaArray:
- 1D, 2D and 3D arrays;
- 1, 2 or 4 components per texel;
- 8/16/32 bit signed/unsigned integers;
- 32 bit float;
- 16 bit float (driver API only);
- fetched with tex1D()/tex2D()/tex3D().
Linear memory:
- only one-dimensional arrays;
- fetched with tex1Dfetch(tex, int).
Binding to linear memory is the simplest way to obtain cached reads; binding to a cudaArray additionally provides the addressing and filtering modes described further below.
To bind a texture to linear memory:
cudaError_t cudaBindTexture(size_t shift, texref tex, &src, size_t size);
shift -- the offset from the beginning of the memory region (usually 0);
tex -- the texture reference;
src -- the device memory to bind;
size -- the size of the region in bytes.
For a two-dimensional region of linear memory (allocated, for example, with cudaMallocPitch):
cudaError_t cudaBindTexture2D(size_t shift, texref tex, &src, &channelDesc, int width, int height, int pitch);
shift -- the offset from the beginning of the region (usually 0);
tex -- the texture reference;
src -- the device memory to bind;
channelDesc -- the texel format (cudaChannelFormatDesc);
width -- the width of the region;
height -- the height of the region;
pitch -- the row stride in bytes.
To unbind a texture:
cudaError_t cudaUnbindTexture(texref tex);
.
,
:
tex1Dfetch(texRef tex, int index);
Tex
Index
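A sketch of the whole sequence for linear memory (the kernel and the helper function are invented for the illustration):

texture<float, 1, cudaReadModeElementType> g_TexRef;

__global__ void copyViaTex(float *dst, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        dst[idx] = tex1Dfetch(g_TexRef, idx);   // cached read through the texture
}

void run(float *d_src, float *d_dst, int n)
{
    cudaBindTexture(0, g_TexRef, d_src, n * sizeof(float));
    copyViaTex<<<(n + 255) / 256, 256>>>(d_dst, n);
    cudaUnbindTexture(g_TexRef);
}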
cudaArray is an opaque memory block optimized for texture fetches. A texture bound to a cudaArray can be two- or three-dimensional and supports two extra mechanisms:
- handling of out-of-range coordinates: Clamp (the coordinate is clamped to the edge) or Wrap (the texture repeats; requires normalized float coordinates);
- filtering of the fetched value: Point (the nearest texel is returned) or Linear (neighbouring texels are interpolated).
Working with a cudaArray.
A cudaArray is declared as a pointer:
cudaArray * a;
Memory for it is allocated with:
cudaError_t cudaMallocArray(struct cudaArray ** arrayPtr, const struct cudaChannelFormatDesc * channelDesc, size_t width, size_t height);
arrayPtr -- receives the pointer to the new cudaArray;
channelDesc -- the texel format;
width -- the width of the array;
height -- the height of the array.
Data is copied into it with:
cudaError_t cudaMemcpyToArray(struct cudaArray * dst, size_t wOffset, size_t hOffset, const void * src, size_t count, enum cudaMemcpyKind kind);
dst -- the destination cudaArray;
wOffset, hOffset -- the offsets inside the array at which writing starts;
src -- the source data;
count -- the number of bytes to copy;
kind -- the direction of the copy (as in cudaMemcpy).
Finally, the texture is bound to the cudaArray with:
cudaError_t cudaBindTextureToArray (const struct textureReference *tex, const struct cudaArray *array, const struct cudaChannelFormatDesc *desc);
tex -- the texture reference;
array -- the cudaArray;
desc -- the texel format.
After that every fetch from the texture reads the contents of the cudaArray.
Reading from a texture bound to a cudaArray is done with:
tex1D (texRef tex, float x);
and similarly with tex2D (coordinates x, y) and tex3D (coordinates x, y, z) for two- and three-dimensional textures;
tex -- the texture reference;
x, y, z -- floating-point coordinates of the texel.
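A sketch of the whole sequence for a two-dimensional cudaArray (the kernel, the helper and the sizes are invented for the illustration):

texture<float, 2, cudaReadModeElementType> g_Tex2D;

__global__ void sample(float *dst, int w, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        dst[y * w + x] = tex2D(g_Tex2D, x + 0.5f, y + 0.5f);
}

void run(const float *h_src, float *d_dst, int w, int h)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray *a = NULL;
    cudaMallocArray(&a, &desc, w, h);
    cudaMemcpyToArray(a, 0, 0, h_src, w * h * sizeof(float),
                      cudaMemcpyHostToDevice);
    cudaBindTextureToArray(g_Tex2D, a);

    dim3 block(16, 16);
    dim3 grid((w + 15) / 16, (h + 15) / 16);
    sample<<<grid, block>>>(d_dst, w, h);

    cudaUnbindTexture(g_Tex2D);
    cudaFreeArray(a);
}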
The effect of the addressing and filtering modes is illustrated in Figure 23.
Figure 23. Texture addressing and filtering (linear interpolation between texels).
Practical assignment.
The assignment is to implement on CUDA a three-dimensional cellular automaton, a variant of the game of Life in a 3D volume. The initial state is given as a list of the coordinates of the live cells:
x1 y1 z1
x2 y2 z2
x3 y3 z3
...
The rules of the automaton are as follows. Each cell is in one of two states, {0, 1}:
- a cell in state 0 becomes 1 if it has 6 or 7 live neighbours;
- a cell in state 1 stays 1 if it has 4, 5, 6 or 7 live neighbours;
- otherwise the cell becomes (or stays) 0.
Input file format:
WX WY WZ T
w1 x1 y1 z1
...
wm xm ym zm
0 0 0 0
where WX, WY, WZ are the dimensions of the field and T is the number of steps to simulate; wi is the state of the i-th listed cell (equal to 1) and xi, yi, zi are its coordinates; the list is terminated by a line of four zeros.
The program is invoked as, for example:
./life test.in 100 test.out
The state of the field is written to the output file for every step whose number satisfies
step % 5 == 0
in the same format as the input:
w11 x11 y11 z11
...
0 0 0 0
When implementing the assignment on CUDA, pay attention to the layout of the field in memory and to the access pattern of the threads, so that global memory reads are coalesced, and consider placing the cells that are read by several threads into shared memory. Measure the running time of the GPU version, including the data transfers between the host and the device, and compare both the results and the timings with a CPU implementation.