CUDA.
GPU. CUDA C
..
GPGPU
GPU?
GPU
CUDA
. , 2011 .
GPGPU
. , 2011 .
GPGPU
. , 2011 .
GPGPU
.
.
.
, .
CUDA:
http://www.nvidia.com/object/cuda_showcase_html.html
. , 2011 .
(heterogeneous computing)
.
:
(CPU);
APU
(GPU);
(, DSP);
;
.
CPU + GPU.
, GPGPU , .
. , 2011 .
GPU
GPU:
:
HLSL
GLSL
Cg
. , 2011 .
:
NVIDIA CUDA
AMD Stream
(
)
:
OpenCL
C++ AMP
()
NVIDIA CUDA
. , 2011 .
NVIDIA CUDA
CUDA
4.0:
http://developer.nvidia.com/cuda-toolkit-40
CUDA driver.
CUDA toolkit.
;
;
;
.
GPU Computing SDK.
.
. , 2011 .
10
GPU?
. , 2011 .
11
CPU GPU
12
CPU GPU
13
GPU
3 5
TOP500 2011
GPU:
: top500.org
. , 2011 .
14
GPU
3 5
Green500 2011
GPU:
: green500.org
. , 2011 .
15
GPU
. , 2011 .
16
CPU GPU
CPU
cache-oriented
GPU
cache-miss oriented
17
CPU GPU
GPU ,
:
,
.
.
.
GPU .
. , 2011 .
18
GPU:
GPU - .
(streaming multiprocessor,
MP), CUDA-
(CUDA core) .
Fermi CUDA-
(scalar
processor, SP).
CUDA-
SIMD.
,
.
. , 2011 .
19
Tesla 8/10
: .. , ..
-
. , 2011 .
20
Tesla 8
: .. , ..
-
. , 2011 .
21
Tesla 10
: .. , ..
-
. , 2011 .
22
Tesla 10
(device/global) .
(shared) SP MP.
(constant cache) ,
SP MP).
(texture cache) ,
SP MP).
(register) () SP.
(local) () SP.
. , 2011 .
23
Fermi
: .. , ..
-
. , 2011 .
24
Fermi
NVIDIA.
L2- .
L1-
.
,
L1- 48kB/16kB.
.
SFU.
C++.
ECC.
. , 2011 .
25
(compute capability)
.
,
(major)
(minor) , major.minor.
, 1.3.
(
CUDA-)
Appendix A NVIDIA CUDA C Programming Guide.
Fermi 2
( 2.x),
1 ( 1.x).
. , 2011 .
26
Appendix G NVIDIA CUDA C
Programming Guide.
1.0.
, 1.3.
(C++), 2.0.
C++ 2.0
, CUDA 4.0.
.
. , 2011 .
27
. , 2011 .
28
(thread)
(kernel).
Fermi 4
.
(thread blocks).
,
CUDA- .
/ (grid).
.
.
. , 2011 .
29
( ) ,
.
Block ID (1D, 2D 3D).
CUDA 4.0.
Thread ID (1D, 2D 3D).
.
: ,
.
, , x y-
.
. , 2011 .
30
31
,
:
;
.
.
.
. , 2011 .
32
.
.
,
.
.
. , 2011 .
33
34
.
,
.
: .. , ..
-
. , 2011 .
35
CUDA C
. , 2011 .
36
CUDA C
CUDA C C/C++,
;
;
.
C++. C++
CUDA.
:
(host) = CPU;
(device) = GPU;
(kernel) ,
GPU.
. , 2011 .
37
__host__
__global__
host
device
host
host
__device__
device
device
__host__ ( ) ,
.
__global__ ,
().
__device__ , ( )
.
. , 2011 .
38
__global__
void.
/
.
.
.
.
__global__
.
. , 2011 .
39
__device__
__device__ __host__,
2 .
.
.
2.0 .
__device__
2.0 ,
.
1.x.
. , 2011 .
40
__device__ ,
:
;
;
, ,
.
__constant__ ,
.
. , 2011 .
41
__shared__ ,
,
;
;
, .
. , 2011 .
42
. , 2011 .
43
GPU :
gridDim dim3,
;
blockIdx uint3,
;
blockDim dim3,
;
threadIdx uint3,
.
.
. , 2011 .
44
(.. ).
.
,
x- .
idx = blockIdx.x * blockDim.x + threadIdx.x;
. , 2011 .
45
CUDA API
CUDA API:
;
;
;
;
API.
cudaError_t,
cudaSuccess .
API:
(CUDA driver API): cu*;
(C runtime for CUDA): cuda*.
. , 2011 .
46
:
cudaError_t cudaGetDeviceCount(int* count)
;
cudaError_t cudaGetDevice (int* dev)
;
cudaError_t cudaGetDeviceProperties (struct
cudaDeviceProp* prop, int dev) ,
.
. , 2011 .
47
:
cudaError_t cudaSetDevice (int dev)
;
cudaError_t cudaChooseDevice (int* dev, const struct
cudaDeviceProp* prop) ,
.
. , 2011 .
48
:
cudaError_t cudaMalloc (void** devPtr, size_t count)
;
cudaError_t cudaFree (void* devPtr)
.
:
cudaError_t cudaMemcpy (void* dst, const void* src,
size_t count, enum cudaMemcpyKind kind)
( );
cudaMemcpyAsync .
. , 2011 .
49
.
.
<<< Dg, Db, Ns, S >>>
.
Dg , Dg.x
* Dg.y * Dg.z ,
.
Db
, Db.x * Db.y * Db.z .
Ns, S ,
.
. , 2011 .
50
:
__global__ void kernel_func() { }
. , 2011 .
51
void __syncthreads()
( ).
.
cudaThreadSynchronize()
( ).
.
cudaStream_t cudaEvent_t.
. , 2011 .
52
. , 2011 .
53
54
nvcc.
Build rules Microsoft Visual Studio.
CUDA 4.0 MSVS 2005 2008.
CUDA 4.0 MSVS 2010.
. , 2011 .
55
CUDA
C/C++ CPU
NVCC
CPU
CUDA
CPU
CPU-GPU
. , 2011 .
56
. , 2011 .
57