
CUDA

Contents

Parallel architectures
    Intel Nehalem
    AMD Istanbul
    IBM Cell
Cluster systems
Nvidia GPU architecture
    G80
    G200
    Fermi
    Compute Capability
The CUDA programming model
    Function and variable qualifiers
    Vector types and built-in variables
    Kernel launch
    The CUDA API
Memory types in CUDA
    Global memory and coalescing
    Shared memory
    Constant memory
    Texture memory and cudaArray
Assignment


Modern graphics processors have turned into massively parallel computing devices whose arithmetic throughput and memory bandwidth considerably exceed those of general-purpose CPUs, which makes them attractive for non-graphics computations. In 2006 Nvidia introduced CUDA, a hardware and software architecture for general-purpose computing on Nvidia graphics processors (GPUs). CUDA lets the programmer write compute kernels in an extension of the C language and execute them on thousands of lightweight GPU threads.

This text introduces CUDA programming: the architecture of Nvidia GPUs, the language extensions, the memory hierarchy and the basic optimization techniques, and it ends with a practical assignment.

To place GPUs among other parallel systems, recall the classical classification of architectures by instruction and data streams (Table 1).

Table 1. Classification of architectures by instruction and data streams

                        Single Data    Multiple Data
Single Instruction      SISD           SIMD (vector units of Intel and AMD CPUs, IBM Cell SPE)
Multiple Instruction    MISD           MIMD (multi-core Intel and AMD processors)

Intel Nehalem

Figure 1. The Intel Nehalem processor

As Figure 1 shows, an Intel Nehalem processor contains four cores, each with its own L1 and L2 caches, while a large L3 cache is shared by all cores. Every core executes its own instruction stream independently of the others. The QPI (QuickPath Interconnect) links connect the processor to other processors and to the chipset. The IMC (Integrated Memory Controller) is placed on the same die as the cores, which removes the external memory hub and reduces memory access latency. This organization is typical of Intel's subsequent multi-core processors as well.
AMD Istanbul

Figure 2. The AMD Istanbul processor

As Figure 2 shows, an AMD Istanbul processor contains six cores and is organized along the same lines as the Intel processor: each core runs its own instruction stream and the memory controller is integrated on the die. The differences from Intel Nehalem lie mainly in the cache hierarchy and in the interconnect between processors, and they do not change the overall picture.

Thus both Intel and AMD build classical multi-core processors: each chip is a MIMD machine, and every individual core is essentially an independent SISD processor. Graphics processors are organized quite differently; the principal differences are summarized in Table 2.

Table 2. Principal differences between CPUs (Intel, AMD) and GPUs

A CPU spends its transistors on a few complex cores with large caches and sophisticated control logic, aiming at low latency for a single thread. A GPU spends them on a large number of simple arithmetic units and hides memory latency by keeping thousands of threads in flight at once.

1.1. IBM Cell

Figure 3. The IBM Cell processor

The Cell processor is built differently (Fig. 3). Alongside a general-purpose PowerPC core it contains eight SPEs (Synergistic Processing Elements) — simple cores intended to carry the bulk of the computation. Each SPE is a SIMD processor working on data in its own small local memory, into which data must be transferred explicitly. Cell therefore combines MIMD and SIMD parallelism on one chip and can be seen as an intermediate step between conventional CPUs and GPUs.

2. Cluster systems

A single processor, however fast, is often not enough, so large computing systems are assembled from many separate machines (nodes) connected by a network. The way the nodes are connected — the topology of the interconnect — largely determines how well such a system scales. Several typical configurations are shown below.

Figure 4. Cluster with a dedicated head node

In the simplest configuration one node is dedicated as the head of the cluster: it provides common services for the compute nodes (address assignment via DHCP, shared storage, job scheduling), while the remaining nodes perform the actual computation (Fig. 4).

Figure 5. Cluster with a common interconnect

In larger installations the nodes are connected through a common switching fabric, so any pair of nodes can exchange data directly rather than through the head node (Fig. 5).

Figure 6. The 3D torus topology

In a 3D torus every node is linked to its neighbours along three axes and the links wrap around at the edges (Fig. 6); this topology is used in many large supercomputers because it scales without a central switch.

The interconnect itself matters as much as the topology. Inexpensive clusters are built on 1 Gbit/s Ethernet; when communication becomes the bottleneck, faster networks are used (10 Gbit/s Myrinet, 40 Gbit/s Infiniband).

A separate class are hybrid systems in which the nodes carry accelerators — Cell processors or graphics cards — that take over the computationally heavy part of the work. The rest of this text is devoted to programming such accelerators, namely Nvidia GPUs, with the CUDA technology.


Before looking at the GPU architecture itself, we introduce the basic CUDA terminology.

Host — the CPU and its memory; the host controls the computation and launches work on the GPU.

Device — the GPU and its memory (the video memory), on which the parallel part of the program runs.

Kernel — a function executed on the device in parallel by a large number of threads.

Grid — the set of all blocks that execute a given kernel.

Block — a group of threads executed on one multiprocessor (SM); threads of one block can share data and synchronize with each other.

Thread — a single instance of the kernel with its own index; threads of different blocks cannot interact directly.

Warp — a group of 32 threads that is physically executed simultaneously on the multiprocessor.

Nvidia GPU architecture

Consider the general structure of Nvidia graphics processors. The GPU is built from a set of identical clusters sharing a common memory subsystem (Fig. 7).

Figure 7. General structure of an Nvidia GPU

The building block is the TPC (Texture Processing Cluster). The maximum number of TPCs is defined by the architecture (8 in G80, 10 in G200), and a particular card may contain fewer of them (2 in a GeForce 220GT, 10 in a GeForce 275). Besides the TPCs, the chip contains memory controllers through which all clusters access the common DRAM of the card, so the DRAM is equally visible to every TPC. This is similar to how CPU models of one architecture differ in the number of cores: compare a four-core Intel Core i7 with a two-core Intel Core 2 Duo. In the G80 and G200 generations the TPC is the top-level unit that groups the multiprocessors together with the texture hardware; in Fermi this organization was changed, as described below.
G80

Figure 8. The G80 architecture

Each TPC contains:

TEX — texture units, which fetch data through the texture caches; they are shared by the multiprocessors of the cluster.

SM — streaming multiprocessors, the units that actually execute the threads. In G80 every TPC contains 2 SMs.

Each SM in turn contains:

SP — scalar (streaming) processors that execute the arithmetic and logic instructions of the threads; there are 8 SPs per SM, and all of them execute the same instruction at the same time.

SFU — special function units that compute transcendental functions (exp, sqrt, log and so on); there are two of them per SM, working alongside the SPs.

Threads are executed by the SM in groups of 32 called warps; the threads of a warp advance through the program together.

Each SM also contains 16 KB of shared memory available to all threads of a block, a register file that is divided among the resident threads, and small caches for constant and texture data.

In total a G80 contains 128 scalar processors (SP): 8 TPCs x 2 SMs x 8 SPs. The performance of the chip comes from this large number of simple processors rather than from the complexity of each of them.
G200

Figure 9. The G200 architecture

The main changes in G200 relative to G80:

- the number of SMs per TPC was increased from 2 to 3;
- the total number of SPs grew to 240;
- support for double-precision arithmetic was added: each SM received one double-precision unit per 8 SPs, so double operations are roughly 8 times slower than single-precision float operations; the double-precision units are separate blocks added alongside the ordinary SPs.

Fermi

Figure 10. The Fermi architecture

The key features of the Fermi architecture:

Third-generation streaming multiprocessor (SM):
- 32 CUDA cores per SM, four times more than in GT200;
- much higher peak double-precision floating-point performance;
- two warp schedulers per SM, so two warps can issue instructions simultaneously;
- 64 KB of on-chip memory per SM, configurable between shared memory and L1 cache.

Second-generation instruction set (Parallel Thread Execution, PTX 2.0):
- full C++ support;
- optimized for OpenCL and DirectCompute;
- full IEEE 754-2008 32-bit and 64-bit floating point;
- unified address space with full 64-bit addressing;
- improved performance through predication.

Improved memory subsystem:
- NVIDIA Parallel DataCache hierarchy with configurable L1 and a unified L2 cache;
- ECC memory support, for the first time in a GPU;
- greatly improved performance of atomic memory operations.

NVIDIA GigaThread engine:
- concurrent execution of several kernels;
- much faster application context switching;
- dual overlapped memory transfer engines.
The main parameters of the three generations are compared in Table 7.

Table 7. Nvidia GPU generations

                                   G80      GT200     Fermi
Year of release                    2006     2008      2009
Transistors, millions              681      1400      3000
CUDA cores (SP)                    128      240       512
Shared memory per SM, KB           16       16        48 or 16 (configurable)
L1 cache per SM, KB                -        -         16 or 48 (configurable)
L2 cache, KB                       -        -         768
ECC memory support                 no       no        yes


Different GPU models differ not only in performance but also in the set of features they support: atomic operations, double precision, maximum grid sizes and so on. To describe these differences Nvidia uses the notion of Compute Capability — a version number assigned to each device. Devices exist with Compute Capability 1.0, 1.1, 1.2, 1.3 and 2.0; a device with a higher Compute Capability supports all features of the lower ones. Examples are given in Table 8.

Table 8. Compute Capability of some GPUs

GPU                   Compute Capability
GeForce 8800GTX       1.0
GeForce 9800GTX       1.1
GeForce 210           1.2
GeForce 275GTX        1.3
Tesla C2050           2.0

The CUDA programming model

CUDA exposes the GPU as a massively parallel coprocessor controlled by the CPU. The computation is organized around the notions already mentioned above:

Thread — a single instance of the kernel with its own identifier; each thread processes its own portion of the data.

Warp — a group of 32 threads executed physically at the same time.

Block — a group of threads executed on one SM; the threads of a block can exchange data through shared memory and synchronize.

Grid — the set of all blocks executing a given kernel.

Kernel — the function that is run on the device by every thread of the grid.

Device — the GPU and its memory, on which kernels execute.

Host — the CPU and its memory, from which kernels are launched.

The programmer partitions the work explicitly: the number of threads per block and the number of blocks in the grid are specified at kernel launch. Threads within a block can be indexed in one, two or three dimensions, and blocks within the grid in one or two dimensions, which is convenient when the data themselves are multi-dimensional.

The hierarchy of grid, blocks and threads is illustrated in Figure 11.

Figure 11. A grid of thread blocks

The sizes of the grid and of the blocks are limited by the hardware; the limits for devices of Compute Capability 1.x are listed in Table 9.

Table 9. Grid and block size limits

                               Grid          Block
Maximum size along x           65536         512
Maximum size along y           65536         512
Maximum size along z           -             64
Maximum number of threads      4294967296    512

When choosing the configuration one should keep in mind how it maps onto the hardware. Blocks are distributed among the multiprocessors, so their number should be noticeably larger than the number of SMs in the device (recall that a GeForce 220GT contains 2 TPCs and a GeForce 275 contains 10); otherwise part of the GPU stays idle. Each block is executed entirely on one SM, and one SM can hold up to 8 blocks simultaneously, provided it has enough registers and shared memory for all of them.

Nvidia supplies the NVCC compiler, which processes source files with the *.cu extension containing a mixture of ordinary host code and CUDA device code. NVCC separates the two parts: the device code is compiled into GPU code, while the host part is handed to the normal C compiler of the system.

To distinguish the two kinds of code, CUDA introduces function type qualifiers that specify where a function is executed and from where it may be called (Table 10).

Table 10. Function type qualifiers

Qualifier       Executed on     Callable from
__device__      device          device
__global__      device          host
__host__        host            host

__global__ marks a kernel — the function that the host launches on the device; it defines the work of a single thread. A __global__ function must return void, and __global__ cannot be combined with __host__.

__host__ and __device__ may be combined; the compiler then generates two versions of the function, one for the CPU and one for the GPU. A function without any qualifier is treated as __host__.

Functions executed on the device are subject to a number of restrictions (some of them are lifted starting with Fermi):

- recursion is not supported;
- the address of a __device__ function cannot be taken (a reference to a __global__ function, however, is used by the host to launch it);
- a variable number of arguments is not allowed;
- static variables inside the function body are not allowed.

These restrictions follow from the way device code is executed: device functions are inlined and run simultaneously by all threads of a warp in SIMD fashion, so the compiled code must be identical for every thread.
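A short sketch that uses all three qualifiers together (the names square, twice and kern are illustrative, not taken from the text):

// executed on the device, callable only from device code
__device__ float square(float x) { return x * x; }

// compiled in two versions: one for the host, one for the device
__host__ __device__ float twice(float x) { return 2.0f * x; }

// kernel: launched from the host, executed on the device, returns void
__global__ void kern(float * out)
{
    int i = threadIdx.x;
    out[i] = square(twice((float)i));
}

int main()
{
    float * d_out;
    cudaMalloc((void**)&d_out, 32 * sizeof(float));
    kern<<<1, 32>>>(d_out); // one block of 32 threads
    cudaThreadSynchronize();
    cudaFree(d_out);
    return 0;
}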


In the same way CUDA introduces variable type qualifiers, which specify in which memory a variable lives and who may access it (Table 11).

Table 11. Variable type qualifiers

Qualifier        Located in                  Accessible from
__device__       global memory (device)      device, R/W
__constant__     constant memory (device)    device read-only; host R/W through the API
__shared__       shared memory of the SM     threads of one block, R/W; writes become visible after __syncthreads()

Notes on their use:

__device__ variables reside in global memory for the whole lifetime of the application and are visible to all threads of the grid.

__shared__ variables are placed in the shared memory of the block; they cannot be given an initializer in the declaration, and writes made by one thread are guaranteed to be visible to the others only after __syncthreads().

__constant__ variables must be declared at file scope and can be written only from the host (with cudaMemcpyToSymbol); for the device they are read-only.

These qualifiers cannot be applied to members of structures and unions, to function parameters, or to local variables of host functions; __shared__ and __constant__ variables implicitly have static storage. The ordinary const qualifier of C remains available and is independent of these extensions.

Vector types in CUDA

CUDA adds built-in vector types constructed from the basic types, with 1 to 4 components:

(u)char1, (u)char2, (u)char3, (u)char4
(u)int1, (u)int2, (u)int3, (u)int4
float1, float2, float3, float4
longlong1, longlong2
double1, double2

The components are accessed as the fields .x, .y, .z, .w, and values are created with the make_<type>() functions:

char2 a = make_char2('a', 'b');
printf("%c %c", a.x, a.y);
float4 b = make_float4(1.0f, 2.0f, 3.0f, 4.0f);
printf("%f %f %f %f", b.x, b.y, b.z, b.w);

(the exact set of types depends on the CUDA version; versions after 3.0 extend it).

For specifying grid and block dimensions the type dim3 is used; it is based on uint3, and components that are not specified explicitly are initialized to 1:

dim3 block = dim3(16, 16, 2);
dim3 grid = dim3(1000);

Note that the dimensions must not be zero:

dim3 block = dim3(16, 16, 0);

is an error — every component must be at least 1.


Inside a kernel the following built-in variables are available; they describe the position of the current thread:

dim3 gridDim — the dimensions of the grid;
uint3 blockIdx — the index of the current block within the grid;
dim3 blockDim — the dimensions of a block;
uint3 threadIdx — the index of the current thread within its block;
int warpSize — the warp size (32 on all current devices).



Launching a kernel

A kernel is declared as an ordinary function with the __global__ qualifier and a parameter list params:

__global__ void Kernel_name(params);

It is launched with the execution-configuration syntax:

Kernel_name<<<grid, block, mem, stream>>> ( params ), where

dim3 grid — the dimensions of the grid, i.e. the number of blocks (a scalar may be used for a one-dimensional grid);
dim3 block — the dimensions of a block, i.e. the number of threads in it;
size_t mem — the amount of dynamically allocated shared memory per block, in bytes (optional, 0 by default);
cudaStream_t stream — the stream in which to launch the kernel (optional).
Example:

#define BS 256 // number of threads in a block
#define N 1024 // total number of elements

// kernel executed on the device
__global__ void kernel (int* data){
    // global index of the element processed by this thread
    int idx = blockIdx.x * BS + threadIdx.x;
    some code
}

int main (){
    // pointer to the data in device memory
    int* data;
    // configuration: N elements, BS threads per block
    dim3 block = dim3(BS);
    dim3 grid = dim3(N / BS);
    some code
    // launch the kernel
    kernel <<<grid, block>>> (data);
    some code
}
Note that the grid is chosen so that it covers all N elements; if N is not a multiple of BS, the number of blocks must be rounded up and the kernel must check that idx does not run past the end of the array. The launch itself is asynchronous: control returns to the host immediately, before the kernel finishes. The functions that manage the device, memory and synchronization belong to the CUDA API, described next.

The CUDA API

Besides the language extensions, CUDA provides a set of functions for managing the device, memory and execution — the CUDA API. There are in fact two APIs: the low-level driver API and the runtime API. The runtime API is implemented on top of the driver API and is considerably simpler to use, at the price of somewhat less control; the two should not be mixed within one program.

Driver API functions are prefixed with cu, runtime API functions with cuda. Almost every API function returns a value of type cudaError_t; the value cudaSuccess means that the call completed without errors.

In what follows the runtime API is used. Note that many of its calls are asynchronous: they return control to the host before the corresponding operation on the device has finished, so an error of a kernel launch, for example, may only be reported by a later call.

The most frequently used runtime API functions:

char* cudaGetErrorString(cudaError_t) — returns a text description of an error code.

cudaError_t cudaGetLastError() — returns the last error produced by any of the preceding runtime calls and resets it to cudaSuccess.

cudaError_t cudaThreadSynchronize() — blocks until all previously launched asynchronous operations (kernels, memory copies) have finished; returns an error if any of them failed.

cudaError_t cudaGetDeviceCount(int *) — returns the number of devices in the system that support CUDA.

cudaError_t cudaGetDeviceProperties (cudaDeviceProp * props, int deviceNo) — fills the structure props with the properties of device number deviceNo; in particular, the Compute Capability is given by the fields major and minor of cudaDeviceProp.

In addition, the runtime API contains functions for selecting the device and for managing memory, streams and events; the memory-related ones are described below.
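A small self-contained sketch that uses these calls to list the available devices (the output format is arbitrary):

#include <stdio.h>

int main()
{
    int count = 0;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    printf("CUDA devices found: %d\n", count);
    for (int i = 0; i < count; i++)
    {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, i); // fill the property structure
        printf("%d: %s, Compute Capability %d.%d\n",
               i, props.name, props.major, props.minor);
    }
    return 0;
}
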
Requirements for efficient access to CUDA global memory:

- the device reads and writes global memory in 32-, 64- and 128-byte transactions;
- an access by a thread to an element t[i] can be combined with the accesses of neighbouring threads only if sizeof(t[0]) is 4, 8 or 16 bytes;
- the address of t[i], i.e. t + i * sizeof(t[0]), must be aligned to sizeof(t[0]);
- memory returned by the allocation functions is aligned to at least 256 bytes, so the alignment of an array as a whole is normally not a problem;
- for user-defined structures the required alignment (4, 8 or 16 bytes) can be forced with the __align__(size) specifier.

Memory types in CUDA

Several kinds of memory with very different properties are available to a CUDA program (Table 12).

Table 12. CUDA memory types

Memory       Access      Scope        Location
Registers    R/W         Per-thread   on chip
Local        R/W         Per-thread   DRAM
Shared       R/W         Per-block    on chip
Global       R/W         Per-grid     DRAM
Constant     R/O         Per-grid     DRAM, cached on chip
Texture      R/O         Per-grid     DRAM, cached on chip

Registers are the fastest memory. The register file of an SM is divided among all threads resident on it; variables declared in a kernel are kept in registers whenever possible. The more registers a single thread uses, the fewer threads can run on the SM at once.

Local memory holds per-thread data that do not fit into registers (register spilling, local arrays). Physically it resides in the DRAM of the card, so it is as slow as global memory.

Shared memory — 16 KB per SM (up to 48 KB on Fermi), shared by the threads of one block. It is located on the chip and, in the absence of bank conflicts, is almost as fast as the registers.

Global memory is the main memory of the video card (up to 6 GB on a Tesla C2070). It is visible to all threads of the grid and to the host, but its latency is hundreds of clock cycles, so access to it must be organized carefully.

Constant memory also resides in DRAM but is cached on the chip; it is read-only for kernels and is written from the host. It is convenient for coefficients and parameters used by all threads.

Texture memory likewise resides in DRAM and is accessed through the cached texture units; for kernels it is read-only. These kinds of memory are considered in more detail below.
Global memory

Global memory is the main storage of a CUDA device and the only memory with which the host exchanges data directly; it is the largest and at the same time the slowest memory of the GPU. All input data must first be copied from the host into global memory, and the results copied back.

Memory is allocated and freed from the host; a kernel receives plain pointers to the allocated regions as its arguments. The basic functions for working with linear global memory are the following:
cudaError_t cudaMalloc ( void ** devPtr, size_t size );
allocates size bytes of linear device memory and returns the pointer to it in devPtr.

cudaError_t cudaMallocPitch ( void ** devPtr, size_t * pitch, size_t width, size_t height );
allocates memory for a 2D region of width x height bytes; each row is padded so that its start is properly aligned. The pointer is returned in devPtr and the real row stride in bytes in pitch.

cudaError_t cudaFree ( void * devPtr );
frees the device memory pointed to by devPtr.

cudaError_t cudaMemset (void* devPtr, int value, size_t count );
fills count bytes starting at devPtr with the byte value.

cudaError_t cudaMemcpy ( void * dst, const void * src, size_t count, enum cudaMemcpyKind kind );
copies count bytes from src to dst; kind specifies the direction of the copy:
cudaMemcpyHostToDevice — from host memory to device memory;
cudaMemcpyDeviceToHost — from device memory to host memory;
cudaMemcpyDeviceToDevice — between two regions of device memory;
cudaMemcpyHostToHost — between two regions of host memory.

cudaError_t cudaMemcpyAsync ( void * dst, const void * src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream );
the asynchronous version of cudaMemcpy, executed in the given stream.
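A short sketch of the less obvious of these functions, cudaMallocPitch (the sizes are arbitrary; note that row k of the allocation starts at the byte offset k * pitch):

#include <stdio.h>

int main()
{
    const size_t width  = 1000 * sizeof(float); // useful size of one row in bytes
    const size_t height = 64;                   // number of rows

    float * d_arr = NULL;
    size_t pitch = 0;
    cudaMallocPitch((void**)&d_arr, &pitch, width, height);
    printf("row of %u bytes, pitch = %u bytes\n",
           (unsigned)width, (unsigned)pitch);

    // zero the whole allocation, including the padding of each row
    cudaMemset(d_arr, 0, pitch * height);

    // copy row 3 back to the host: it starts at byte offset 3 * pitch
    float host_row[1000];
    cudaMemcpy(host_row, (char*)d_arr + 3 * pitch, width,
               cudaMemcpyDeviceToHost);

    cudaFree(d_arr);
    return 0;
}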

As an example of working with global memory, consider a program that computes the number pi by integrating a quarter of the unit circle with the trapezoid rule: each thread computes the area of one narrow trapezoid, and the partial results are summed on the host (Listing 13).

Listing 13. Computing pi on the GPU

#include <stdio.h>
#include <time.h>
#define CUDA_FLOAT float
#define GRID_SIZE 256
#define BLOCK_SIZE 256
#define DEVICE 0 // index of the GPU used by cudaSetDevice below

// check the result of the most recent CUDA API call
void check_cuda_error(const char *message)
{
cudaError_t err = cudaGetLastError();
if(err!=cudaSuccess)
printf("ERROR: %s: %s\n", message, cudaGetErrorString(err));
}

__global__ void pi_kern(CUDA_FLOAT *res)
{
    int n = threadIdx.x + blockIdx.x * BLOCK_SIZE;       // global index of the thread
    CUDA_FLOAT x0 = n * 1.f / (BLOCK_SIZE * GRID_SIZE);  // left edge of this thread's strip
    CUDA_FLOAT y0 = sqrtf(1 - x0 * x0);
    CUDA_FLOAT dx = 1.f / (1.f * BLOCK_SIZE * GRID_SIZE); // width of the strip
    CUDA_FLOAT s = 0;                                     // area of the trapezoid
    CUDA_FLOAT x1, y1;
    x1 = x0 + dx;
    y1 = sqrtf(1 - x1 * x1);
    s = (y0 + y1) * dx / 2.f;
    res[n] = s;                                           // write the partial result
}
int main(int argc, char** argv)
{
    cudaSetDevice(DEVICE);                       // select the GPU
    check_cuda_error("Error selecting device");
    CUDA_FLOAT *res_d;                           // result array on the device
    CUDA_FLOAT res[GRID_SIZE * BLOCK_SIZE];      // result array on the host
    cudaMalloc((void**)&res_d, sizeof(CUDA_FLOAT) * GRID_SIZE * BLOCK_SIZE);
    check_cuda_error("Allocating memory on GPU");
    dim3 grid(GRID_SIZE);
    dim3 block(BLOCK_SIZE);
    pi_kern<<<grid, block>>>(res_d);             // launch the kernel
    cudaThreadSynchronize();                     // wait for the kernel to finish
    check_cuda_error("Executing kernel");
    cudaMemcpy(res, res_d, sizeof(CUDA_FLOAT) * GRID_SIZE * BLOCK_SIZE,
               cudaMemcpyDeviceToHost);          // copy the results to the host
    check_cuda_error("Copying results from GPU");
    cudaFree(res_d);                             // free device memory
    check_cuda_error("Freeing device memory");
    CUDA_FLOAT pi = 0;
    for (int i = 0; i < GRID_SIZE * BLOCK_SIZE; i++)
    {
        pi += res[i];                            // sum the partial areas
    }
    pi *= 4;                                     // quarter circle -> full circle
    printf("PI = %.12f\n", pi);
    return 0;
}

The CUDA_FLOAT macro makes it easy to switch the whole program between single and double precision; on devices without double support it must remain float. Note that all memory management is done by the host: the kernel only receives a pointer to an already allocated array, and each thread writes exactly one element of it. The final summation of the partial areas is performed on the CPU.


Coalescing of global memory accesses

The key to global memory performance is coalescing: the hardware can merge the memory accesses of the threads of one half-warp (16 threads) into a single transaction, so that the whole group is served by one read or write of a large block (Fig. 14).

Figure 14. Coalesced access by a half-warp

Accesses are coalesced when:

- the threads of the half-warp access words of 32, 64 or 128 bits;
- all the accessed words fall into one aligned segment of memory.

The exact requirements depend on the Compute Capability of the device (Table 13).

Table 13. Conditions for coalescing

Compute Capability 1.0, 1.1:
- the 16 threads access 32-bit words (one 64-byte transaction) or 64-bit words (one 128-byte transaction);
- the words form one contiguous, properly aligned segment;
- the k-th thread of the half-warp accesses exactly the k-th word of the segment.

Compute Capability >= 1.2:
- the threads may access the words of a segment in any order, and several threads may read the same word;
- the accesses are merged as long as they fall into one aligned segment of 32 bytes for 8-bit words, 64 bytes for 16-bit words, or 128 bytes for 32- and 64-bit words;
- otherwise the hardware issues the minimum number of transactions that cover all requested words.

Figure 15. Examples of coalesced access patterns

Figure 16. Examples of access patterns that are not coalesced on Compute Capability 1.0, 1.1

On devices with Compute Capability 1.0 and 1.1 a pattern that violates these rules degenerates into 16 separate transactions per half-warp; on devices with Compute Capability >= 1.2 the penalty is smaller, but a scattered pattern is still several times slower than a coalesced one. Coalescing is therefore the single most important optimization of global memory access: data should be laid out so that neighbouring threads work with neighbouring elements.
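Two kernels that differ only in their access pattern illustrate the point (the names are illustrative):

#define BS 256

// neighbouring threads read neighbouring words: the accesses of each
// half-warp fall into one aligned segment and are merged into one transaction
__global__ void copy_coalesced(const float * in, float * out)
{
    int idx = blockIdx.x * BS + threadIdx.x;
    out[idx] = in[idx];
}

// neighbouring threads read words stride elements apart: each thread of the
// half-warp hits its own segment, so every access becomes a separate transaction
__global__ void copy_strided(const float * in, float * out, int stride)
{
    int idx = blockIdx.x * BS + threadIdx.x;
    out[idx] = in[idx * stride];
}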


Shared memory

To serve many threads at once, shared memory is divided into 16 banks (32 on Fermi); successive 32-bit words belong to successive banks. If the 16 threads of a half-warp address 16 different banks, all accesses are performed simultaneously. If several threads address different words that lie in the same bank, a bank conflict occurs and these accesses are serialized, multiplying the access time.

Shared memory is typically used as a manually controlled cache: the threads of a block load the data they need from global memory into a __shared__ array, synchronize with __syncthreads(), process the data, and write the results back. An example:
#define BS 256 // number of threads in a block
__global__ void kern(float* data){
    // buffer in shared memory, one float per thread of the block
    __shared__ float a[BS];
    int idx = blockIdx.x * BS + threadIdx.x;
    // each thread loads its element from global memory
    a[threadIdx.x] = data[idx];
    // wait until the whole block has filled the buffer
    __syncthreads();
    // use the element loaded by the neighbouring thread
    data[idx] = a[threadIdx.x] + a[(threadIdx.x + 1) % BS];
}

Listing 17. Shared memory buffer of float elements

In Listing 17 every thread of a half-warp works with its own 32-bit word, so consecutive threads hit consecutive banks and the accesses proceed without bank conflicts.
#define BS 256 // number of threads in a block
__global__ void kern(short* data){
    // the same buffer, but with 16-bit elements
    __shared__ short a[BS];
    int idx = blockIdx.x * BS + threadIdx.x;
    // each thread loads its element from global memory
    a[threadIdx.x] = data[idx];
    // wait until the whole block has filled the buffer
    __syncthreads();
    // use the element loaded by the neighbouring thread
    data[idx] = a[threadIdx.x] + a[(threadIdx.x + 1) % BS];
}

Listing 18. The same buffer with short elements

Since a short occupies 16 bits, two consecutive elements fall into the same 32-bit bank: the threads of a half-warp produce 2-way bank conflicts and the accesses are serialized. With char elements (8 bits) the conflict becomes 4-way. Such conflicts can be removed by padding the array or by packing the small elements into 32-bit words.
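A common way to avoid conflicts in two-dimensional tiles is to pad the shared array by one extra element per row. The matrix-transpose kernel below sketches the technique; it is not part of the text above and assumes a square matrix whose side is a multiple of the tile size:

#define TILE 16

__global__ void transpose(const float * in, float * out, int width)
{
    // the extra column shifts each row by one bank and removes the conflicts
    // that would otherwise occur when a column of the tile is read
    __shared__ float tile[TILE][TILE + 1];

    int x = blockIdx.x * TILE + threadIdx.x;
    int y = blockIdx.y * TILE + threadIdx.y;
    tile[threadIdx.y][threadIdx.x] = in[y * width + x]; // coalesced read of a row

    __syncthreads();

    int tx = blockIdx.y * TILE + threadIdx.x;
    int ty = blockIdx.x * TILE + threadIdx.y;
    out[ty * width + tx] = tile[threadIdx.x][threadIdx.y]; // conflict-free column read
}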

Example: matrix multiplication

As a larger example, consider the multiplication of two matrices, C = A * B. Notation:

WA, WB — the widths of matrices A and B;
BS (BLOCK_SIZE) — the size of a thread block (the block is square);
BX = blockIdx.x, BY = blockIdx.y;
TX = threadIdx.x, TY = threadIdx.y.

Each thread computes one element of the result matrix C: it reads one row of A and one column of B from global memory and accumulates their dot product. The blocks are two-dimensional, so the whole matrix C is covered by a 2D grid. Note that the matrices are stored row by row as one-dimensional arrays.

Listing 19. Simple matrix multiplication kernel
#define BLOCK_SIZE 16
__global__ void matMult ( float * a, float * b, float * c, int wa, int wb )
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    float sum = 0.0f;
    int ia = wa * BLOCK_SIZE * by + wa * ty;         // start of the row of A for this thread
    int ib = BLOCK_SIZE * bx + tx;                   // start of the column of B
    int ic = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx; // upper-left corner of the block of C
    for ( int k = 0; k < wa; k++ )
        sum += a [ia + k] * b [ib + k * wb];
    c [ic + wb * ty + tx] = sum;
}

Let us estimate the memory traffic of this kernel. For every element of C a thread performs about 2 * WA arithmetic operations (WA multiplications and WA additions) and 2 * WA reads from global memory, i.e. only one arithmetic operation per memory access. The profile of the kernel confirms this (Fig. 20).

Figure 20. Profile of the simple kernel

Most of the time (around 85%) is spent waiting for global memory, while the arithmetic units stand idle. The standard remedy is to reuse data through shared memory.

The matrices are split into square sub-matrices of size BLOCK_SIZE x BLOCK_SIZE. The block that computes the sub-matrix C' of the result walks along the corresponding strip of A and strip of B, loading one sub-matrix of each into shared memory at a time and accumulating

C' = A1 * B1 + A2 * B2 + ...

Each element brought into shared memory is then used BLOCK_SIZE times by the threads of the block instead of once.

Do not forget to synchronize the threads of the block after loading a pair of sub-matrices and again before loading the next pair!

With this scheme the number of arithmetic operations per thread stays about 2 * WA, while the number of global memory reads drops by a factor of BLOCK_SIZE, to about 2 * WA / BLOCK_SIZE = WA / 8 (Fig. 21).

Figure 21. Memory traffic of the block-wise kernel

The profile of the optimized kernel changes accordingly (Fig. 22):

Figure 22. Profile of the block-wise kernel

the share of time spent on global memory accesses drops to about 13%, while about 81% now goes to computation (compare with Fig. 20), and the kernel runs several times faster.
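A sketch of such a block-wise kernel is given below; it illustrates the idea under the assumption that WA and WB are multiples of BLOCK_SIZE and is not necessarily the exact code that was profiled above:

#define BLOCK_SIZE 16

__global__ void matMultShared(float * a, float * b, float * c, int wa, int wb)
{
    int tx = threadIdx.x, ty = threadIdx.y;
    int aBegin = wa * BLOCK_SIZE * blockIdx.y; // first sub-matrix of A for this block
    int bBegin = BLOCK_SIZE * blockIdx.x;      // first sub-matrix of B for this block
    float sum = 0.0f;

    for (int ia = aBegin, ib = bBegin; ia < aBegin + wa;
         ia += BLOCK_SIZE, ib += BLOCK_SIZE * wb)
    {
        __shared__ float as[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float bs[BLOCK_SIZE][BLOCK_SIZE];
        as[ty][tx] = a[ia + wa * ty + tx];   // each thread loads one element of A'
        bs[ty][tx] = b[ib + wb * ty + tx];   // and one element of B'
        __syncthreads();                     // the whole pair of tiles must be loaded

        for (int k = 0; k < BLOCK_SIZE; k++) // multiply the two tiles from shared memory
            sum += as[ty][k] * bs[k][tx];
        __syncthreads();                     // do not overwrite the tiles too early
    }
    int ic = wb * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
    c[ic + wb * ty + tx] = sum;
}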

Constant memory

Constant memory is small (64 KB for the whole device) but cached on the chip, so when all threads of a warp read the same address the access is nearly as fast as a register read. It is read-only for kernels and is written from the host; it is convenient for coefficients and other parameters shared by all threads.

__constant__ float constData [256]; — declares the array constData in constant memory; the declaration must be made at file scope, outside any function.

cudaMemcpyToSymbol ( constData, hostData, sizeof ( hostData ), 0, cudaMemcpyHostToDevice ); — copies the contents of the host array hostData into constData in constant memory.
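Putting the declaration and the copy together (the kernel and array names are illustrative):

__constant__ float constData[256]; // table of coefficients in constant memory

__global__ void scale(float * out)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = constData[i % 256] * 2.0f; // every thread reads the cached table
}

int main()
{
    float hostData[256];
    for (int i = 0; i < 256; i++)
        hostData[i] = (float)i;

    // fill constant memory from the host
    cudaMemcpyToSymbol(constData, hostData, sizeof(hostData), 0,
                       cudaMemcpyHostToDevice);

    float * d_out;
    cudaMalloc((void**)&d_out, 1024 * sizeof(float));
    scale<<<4, 256>>>(d_out);
    cudaThreadSynchronize();
    cudaFree(d_out);
    return 0;
}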

Texture memory

Texture memory is not a separate physical memory: a texture is bound either to a region of linear global memory or to a special cudaArray, and reads from it go through the texture units of the GPU and their cache. Texture fetches pay off when threads read data in a pattern that is local but does not satisfy the coalescing rules: the texture cache is optimized for 2D locality and smooths out such irregular access. In addition, the texture hardware can handle out-of-range coordinates, normalize coordinates and interpolate between neighbouring elements at no extra cost. For a kernel, data read through a texture are read-only.

Working with textures consists of declaring a texture reference, binding it to memory from the host, reading it inside the kernel with the texture fetch functions, and unbinding it when it is no longer needed.

A texture reference is declared at file scope as follows:
texture< type , dim, tex_type> g_TexRef;

type — the type of the elements of the texture;
dim — the dimensionality of the texture (1, 2 or 3);
tex_type — the read mode:
    o cudaReadModeNormalizedFloat — integer values are returned mapped to floating point [0,1];
    o cudaReadModeElementType — values are returned exactly as stored.

The format of a texture element is described by the structure:

struct cudaChannelFormatDesc {
    int x, y, z, w;
    enum cudaChannelFormatKind f;
};

int x, y, z, w — the number of bits in each of the four components, from 0 to 32;
enum cudaChannelFormatKind f — the kind of the components:
    o cudaChannelFormatKindSigned — signed int;
    o cudaChannelFormatKindUnsigned — unsigned int;
    o cudaChannelFormatKindFloat — float.
A texture can be bound either to linear memory obtained with cudaMalloc or to a cudaArray (Table 14).

Table 14. What a texture can be bound to in CUDA

cudaArray:
- 1D, 2D or 3D arrays;
- 1, 2 or 4 components per element;
- component types: 8/16/32 bit signed/unsigned integers, 32 bit float, 16 bit float (driver API only);
- read in the kernel with tex1D() / tex2D() / tex3D();
- supports normalized coordinates, addressing modes and filtering.

Linear memory:
- a one-dimensional array of elements;
- read in the kernel with tex1Dfetch(tex, int);
- no filtering and no special addressing.

We first consider textures bound to linear memory: this is the simpler case, since the data stay in ordinary global memory and no additional copies are needed, while reads still go through the texture cache. Binding to a cudaArray is considered afterwards.
A texture reference is bound to linear memory with:

cudaError_t cudaBindTexture(size_t * shift, texref tex, const void * src, size_t size);

shift — the returned offset of the data from the alignment required by the hardware (may be NULL);
tex — the texture reference;
src — pointer to the linear device memory to bind;
size — size of the region in bytes.

Two-dimensional data in linear memory (for example, an array allocated with cudaMallocPitch) are bound with:

cudaError_t cudaBindTexture2D(size_t * shift, texref tex, const void * src, const cudaChannelFormatDesc & channelDesc, size_t width, size_t height, size_t pitch);

shift — the returned offset from the required alignment (may be NULL);
tex — the texture reference;
src — pointer to the memory holding the 2D data;
channelDesc — the format of the elements (see above);
width — width of the region in elements;
height — height of the region in elements;
pitch — the row stride in bytes.

The binding is removed with:

cudaError_t cudaUnbindTexture(texref tex);

Inside the kernel a texture bound to linear memory is read with:

tex1Dfetch(texRef tex, int index);

tex — the texture reference;
index — the integer index of the element to read.
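A compact sketch that binds a texture reference to a linear float array and reads it in a kernel (all names are illustrative):

texture<float, 1, cudaReadModeElementType> g_TexRef; // file-scope texture reference

__global__ void add_one(float * out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = tex1Dfetch(g_TexRef, i) + 1.0f; // cached read through the texture unit
}

int main()
{
    const int n = 1024;
    float * d_in, * d_out;
    cudaMalloc((void**)&d_in,  n * sizeof(float));
    cudaMalloc((void**)&d_out, n * sizeof(float));
    cudaMemset(d_in, 0, n * sizeof(float));

    // bind the texture reference to the linear array
    cudaBindTexture(NULL, g_TexRef, d_in, n * sizeof(float));

    add_one<<<n / 256, 256>>>(d_out, n);
    cudaThreadSynchronize();

    cudaUnbindTexture(g_TexRef);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}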

Binding a texture to a cudaArray gives additional possibilities. A cudaArray is an opaque memory object whose internal layout is optimized for texture fetches: for 2D arrays, elements close to each other in both coordinates are stored close in memory, which improves the hit rate of the texture cache. Textures bound to a cudaArray additionally support:

- normalized coordinates (the ranges [0, W] and [0, H] are mapped to [0, 1]);
- handling of out-of-range coordinates:
    Clamp — the coordinate is clamped to the nearest edge of the array;
    Wrap — the coordinate wraps around, so the texture repeats periodically;
- filtering when the texture is addressed with float coordinates:
    Point — the value of the nearest element is returned;
    Linear — the values of the neighbouring elements are linearly interpolated.

Working with cudaArray

A cudaArray is declared as a pointer:

cudaArray * a;

and allocated with:

cudaError_t cudaMallocArray(struct cudaArray ** arrayPtr, const struct cudaChannelFormatDesc * channelDesc, size_t width, size_t height);

arrayPtr — the pointer to the allocated cudaArray is returned here;
channelDesc — the format of the elements (see above);
width — the width of the array in elements;
height — the height of the array in elements.

Data are copied into the array with:

cudaError_t cudaMemcpyToArray(struct cudaArray * dst, size_t wOffset, size_t hOffset, const void * src, size_t count, enum cudaMemcpyKind kind);

dst — the destination cudaArray;
wOffset, hOffset — offsets in the destination array;
src — pointer to the source data;
count — the number of bytes to copy;
kind — the direction of the copy (the same constants as for cudaMemcpy).

Finally, the texture reference is bound to the cudaArray:

cudaError_t cudaBindTextureToArray (const struct textureReference *tex, const struct cudaArray *array, const struct cudaChannelFormatDesc *desc);

tex — the texture reference;
array — the cudaArray to bind;
desc — the format of the elements.

A cudaArray bound to a texture cannot be accessed from the kernel in any other way: its contents are available only through the texture fetch functions.


Inside the kernel a texture bound to a cudaArray is read with the functions:

tex1D (texRef tex, float x);
tex — the texture reference; x — the coordinate of the element.

tex2D (texRef tex, float x, float y);
tex — the texture reference; x, y — the coordinates of the element in the 2D texture.

tex3D (texRef tex, float x, float y, float z);
tex — the texture reference; x, y, z — the coordinates of the element in the 3D texture.

An example of working with a texture bound to a cudaArray is given in Listing 23.

Listing 23. Reading a 2D texture bound to a cudaArray
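A sketch of the steps such an example involves — allocating a cudaArray, copying data into it, binding the texture and reading it with tex2D (names and sizes are illustrative):

texture<float, 2, cudaReadModeElementType> g_Tex2D;

__global__ void sample_field(float * out, int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height)
        out[y * width + x] = tex2D(g_Tex2D, x + 0.5f, y + 0.5f); // centre of the texel
}

int main()
{
    const int W = 256, H = 256;
    static float host[W * H];
    for (int i = 0; i < W * H; i++)
        host[i] = (float)i;

    // allocate the cudaArray and copy the data into it
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray * arr;
    cudaMallocArray(&arr, &desc, W, H);
    cudaMemcpyToArray(arr, 0, 0, host, sizeof(host), cudaMemcpyHostToDevice);

    // bind the texture reference to the array
    cudaBindTextureToArray(g_Tex2D, arr, desc);

    float * d_out;
    cudaMalloc((void**)&d_out, sizeof(host));
    dim3 block(16, 16);
    dim3 grid(W / 16, H / 16);
    sample_field<<<grid, block>>>(d_out, W, H);
    cudaThreadSynchronize();

    cudaUnbindTexture(g_Tex2D);
    cudaFreeArray(arr);
    cudaFree(d_out);
    return 0;
}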

Assignment

The input data for the practical tasks are given as text files with point coordinates, one point per line:

x1 y1 z1
x2 y2 z2
x3 y3 z3
...

A typical file contains on the order of 1000 points.

The main assignment is to implement, using CUDA, a three-dimensional cellular automaton — a 3D variant of the game of "Life". The field is a regular grid of cells, each of which is either dead or alive, and the state of the whole field evolves step by step according to local rules. The classical game of Life is played on a two-dimensional field; in this assignment a three-dimensional field is used.

The neighbours of a cell are the cells whose coordinates differ from its own by at most 1; the field is closed into a torus, so cells on the boundary take their missing neighbours from the opposite side.

At every step the new state of each cell is computed simultaneously for the whole field from the current states of its neighbours.

The rules are:

- the state of a cell is a value from {0, 1} (dead or alive);
- a dead cell (0) becomes alive (1) if it has 6 or 7 live neighbours;
- a live cell (1) stays alive (1) if it has 4, 5, 6 or 7 live neighbours;
- in all other cases the cell becomes (or remains) dead (0).
Input file format:

WX WY WZ T
w1 x1 y1 z1
...
wm xm ym zm
0 0 0 0

WX, WY, WZ are the dimensions of the field and T is an additional parameter, equal to 1 in this assignment; wi is the state of the i-th listed cell (always 1), and xi, yi, zi are its coordinates. The list of live cells is terminated by the line 0 0 0 0.

The program is invoked as:

./life test.in 100 test.out

where test.in is the input file, 100 is the number of steps to simulate and test.out is the output file. The state of the field is written to the output file for every step whose number satisfies step % 5 == 0.
Output file format:

w11 x11 y11 z11
...
w1m x1m y1m z1m
0 0 0 0
...
wn1 xn1 yn1 zn1
...
wnk xnk ynk znk
0 0 0 0

For every saved step the list of live cells is written in the same four-number format and terminated by the line 0 0 0 0; wji is the state of the i-th cell in the j-th saved step (always 1), and xji, yji, zji are its coordinates.
Example of an input file:

20 20 20 1
1 5 7 5
1 5 8 5
1 6 8 5
1 7 5 5
1 8 5 5
1 8 6 5
1 5 7 6
1 5 8 6
1 6 8 6
1 7 5 6
1 8 5 6
1 8 6 6
0 0 0 0

When implementing the automaton with CUDA, keep the field in the global memory of the device and let each thread update one cell per step; two copies of the field are needed, so that a step reads the old state and writes the new one (double buffering). Copy the field back to the host only on the steps that must be written to the output file. Compare the running time with a sequential CPU implementation, try the optimizations described in this text (coalesced access to the field, shared memory for the neighbourhood processed by a block), and measure how much each of them gives.