±à¼ÍƼö: |
±¾ÎÄÀ´×ÔÓÚcsdn£¬Éî¶ÈѧϰµÄÐËÆð£¬Ê¹µÃ¶àÏß³ÌÒÔ¼°GPU±à³ÌÖð½¥³ÉΪËã·¨¹¤³ÌʦÎÞ·¨¹æ±ÜµÄÎÊÌâ¡£ÕâÀïÖ÷Òª¼Ç¼×Ô¼ºµÄGPU×ÔѧÀú³Ì¡£ |
|
¡¶GPU±à³Ì×Ôѧ1 ¡ª¡ª ÒýÑÔ¡·
¡¶GPU±à³Ì×Ôѧ2 ¡ª¡ª CUDA»·¾³ÅäÖá·
¡¶GPU±à³Ì×Ôѧ3 ¡ª¡ª CUDA³ÌÐò³õ̽¡·
¡¶GPU±à³Ì×Ôѧ4 ¡ª¡ª CUDAºËº¯ÊýÔËÐвÎÊý¡·
¡¶GPU±à³Ì×Ôѧ5 ¡ª¡ª Ïß³ÌÐ×÷¡·
¡¶GPU±à³Ì×Ôѧ6 ¡ª¡ª º¯ÊýÓë±äÁ¿ÀàÐÍÏÞ¶¨·û¡·
¡¶GPU±à³Ì×Ôѧ7 ¡ª¡ª ³£Á¿ÄÚ´æÓëʼþ¡·
¡¶GPU±à³Ì×Ôѧ8 ¡ª¡ª ÎÆÀíÄÚ´æ¡·
¡¶GPU±à³Ì×Ôѧ9 ¡ª¡ª Ô×Ó²Ù×÷¡·
¡¶GPU±à³Ì×Ôѧ10 ¡ª¡ª Á÷²¢ÐС·
Ò»¡¢ ÒýÑÔ
´«Í³µÄÖÐÑë´¦ÀíÆ÷£¨CPU£¬Central Processing Unit) ÄÚ²¿½á¹¹Òì³£¸´ÔÓ£¬Ö÷ÒªÊÇÒòΪÆäÐèÒªºÜÇ¿µÄͨÓÃÐÔÀ´´¦Àí¸÷ÖÖ²»Í¬µÄÊý¾ÝÀàÐÍ£¬Í¬Ê±ÓÖÒªÂß¼ÅжÏÓÖ»áÒýÈë´óÁ¿µÄ·ÖÖ§Ìø×ªºÍÖжϵĴ¦Àí¡£
ΪÁËÌá¸ß¼ÆËãÄÜÁ¦£¬CPUͨ³£»á²ÉÈ¡Ìá¸ßʱÖÓÆµÂÊ»òÔö¼Ó´¦ÀíÆ÷ºËÊýÁ¿µÄ²ßÂÔ¡£
ΪÁ˽øÒ»²½»ñµÃ¸ü¸ßЧµÄ¼ÆË㣬ͼÐδ¦ÀíÆ÷£¨GPU, Graphics Processing Unit£©Ó¦Ô˶øÉú¡£
GPU¿ÉÒÔÔÚÎÞÐèÖжϵĴ¿¾»»·¾³Ï´¦ÀíÀàÐ͸߶ÈͳһµÄ¡¢Ï໥ÎÞÒÀÀµµÄ´ó¹æÄ£Êý¾Ý¡£
ÈçÏÂͼËùʾ£º

GPUµÄ¸ßЧÔÚÓÚ¿ÉÒԸ߶Ȳ¢Ðд¦Àí¡£ ÒÔÁ½¸öÏòÁ¿Ïà¼ÓΪÀý£¬CPU¿ÉÄܲÉȡѻ·´¦Àí£¬Ã¿¸öÑ»·¶ÔÒ»¸ö·ÖÁ¿×ö¼Ó·¨¡£GPUÔò¿ÉÒÔ¿ª¶à¸öỊ̈߳¬Ã¿¸öÏß³Ìͬʱ¶ÔÒ»¸ö·ÖÁ¿×ö¼Ó·¨¡£CPU¼Ó·¨µÄËÙ¶ÈÒ»°ã¿ìÓÚGPU£¬µ«ÒòΪGPU¿ÉÒÔͬʱ¿ª´óÁ¿Ï̲߳¢ÐÐÅÜ£¬Òò´Ë¸ü¼Ó¸ßЧ¡£
ΪÁ˽µµÍGPU³ÌÐòµÄ¿ª·¢ÄѶȣ¬NVIDIAÍÆ³öÁË CUDA£¨Compute Unified Device
Architecture£¬Í³Ò»¼ÆËãÉ豸¼Ü¹¹£©ÕâÒ»±à³ÌÄ£ÐÍ¡£
¶þ¡¢ CUDA»·¾³ÅäÖÃ
Ê×ÏÈ˵Ã÷Ò»ÏÂÎҵĻù´¡»·¾³£º ÁªÏëСг¬¼«±¾£»Win10 X64 רҵ°æ£» NVIDIA GeForce
940MX; VS2013¡£
2.1 °²×°CUDA Toolkit
ÔÚ±£Ö¤NVIDIAÏÔ¿¨Çý¶¯³É¹¦°²×°µÄÌõ¼þÏ£¬´ÓÏÂÃæÁ´½ÓÏÂÔØ²¢°²×°¶ÔÓ¦°æ±¾µÄCUDA Toolkit.£¨×¢Ò⣺×îºÃÒѾ°²×°ºÃVS£©
https://developer.nvidia.com/cuda-downloads¡£ ½¨ÒéÓÒ¼ü¸´ÖÆÏÂÔØÁ´½ÓÈ»ºóѸÀ×ÏÂÔØ¡£

ͨ¹ýÔÚÃüÁî´°ÖÐÖ´ÐÐ nvcc -V³õ²½ÅжÏÊÇ·ñ°²×°³É¹¦£º

°²×°³É¹¦ºó(ĬÈϰ²×°)ϵͳ»áÔö¼ÓÈçÏ»·¾³±äÁ¿£º
CUDA_PATH£º C:\Program
Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0
CUDA_PATH_V8_0£º C:\Program Files\NVIDIA GPU Computing
Toolkit\CUDA\v8.0
NUMBER_OF_PROCESSORS£º 4
NVCUDASAMPLES_ROOT£º C:\ProgramData\NVIDIA Corporation\CUDA
Samples\v8.0
NVCUDASAMPLES8_0_ROOT£º C:\ProgramData\NVIDIA Corporation\CUDA
Samples\v8.0
NVTOOLSEXT_PATH£º C:\Program Files\NVIDIA Corporation\NvToolsExt\ |
2.2 VS²âÊÔ¹¤³Ì
CUDA Toolkit°²×°³É¹¦ºó»á×Ô¶¯ºÍϵͳµÄ±àÒëÆ÷½øÐа󶨡£ ÒÔÎÒµÄVS2013ΪÀý£¬¡°Ð½¨ÏîÄ¿¡±ÏÂÔö¼ÓÁË
¡°NVIDIA¡±Ñ¡Ïî¡£

CUDA ToolkitÒѾΪÎÒÃÇÌṩÁËһЩ¼òµ¥µÄÑùÀý£¬Î»ÓÚ »·¾³±äÁ¿ ¡°NVCUDASAMPLES_ROOT¡±ËùÖ¸ÏòµÄĿ¼Ï¡£
×¢Ò⣬¸ÃĿ¼ͨ³£ÎªÒþ²ØÄ¿Â¼¡£

Ëæ±ãÑ¡ÔñÆäÖеÄÒ»¸ö×ÓÏîÄ¿£¬Èç¹û¿ÉÒԳɹ¦ÔËÐУ¬Ôò±íÃ÷CUDAȷʵÒѾ°²×°³É¹¦¡£
Èý¡¢ CUDA³ÌÐò³õ̽
3.1 Ö÷»úÓëÉ豸
ͨ³£½«CPU¼°ÆäÄÚ´æ³ÆÖ®ÎªÖ÷»ú£¬GPU¼°ÆäÄÚ´æ³ÆÖ®ÎªÉ豸¡£
ÈçÏÂͼËùʾ£¬Ð½¨Ò»¸öNVIDIA CUDA¹¤³Ì£¬²¢ÃüÃûΪ ¡°1-helloworld¡±

Ö®ºó·¢ÏÖÏîÄ¿Àï¶àÁËÒ»¸ö ¡°kernel.cu¡±µÄÎļþ£¬¸ÃÎļþÄÚÈÝÊÇÒ»¸ö¾µäµÄ ʸÁ¿Ïà¼Ó µÄGPU³ÌÐò¡£
¿ÉÒÔÔÝʱȫ²¿×¢Ê͸ôúÂ룬²¢³¢ÊÔ±àÒëÔËÐÐÏÂÃæµÄÎÒÃǾ³£¼ûµ½µÄ±à³ÌÈëÃÅʾÀý£º
#include <iostream>
int main()
{
std::cout<<"Hello, World!"<<std::endl;
system("pause");
return 0;
} |
Õâ¿´ÆðÀ´ºÍÆÕͨµÄC++³ÌÐò²¢Ã»Ê²Ã´Çø±ð¡£ Õâ¸öʾÀýÖ»ÊÇΪÁË˵Ã÷CUDA C±à³ÌºÍÎÒÃÇÊìϤµÄ±ê×¼CÔںܴó³Ì¶ÈÉÏÊÇûÓÐÇø±ðµÄ¡£
ͬʱ£¬Õâ¶Î³ÌÐòÖ±½ÓÔËÐÐÔÚ Ö÷»úÉÏ¡£
½ÓÏÂÀ´£¬ÎÒÃÇ¿´¿´ÈçºÎʹÓÃGPUÀ´Ö´ÐдúÂë¡£ÈçÏ£º
#include <iostream>
__global__ void mkernel(void){}
int main()
{
mkernel <<<1,1>>>();
std::cout<<"Hello, World!"<<std::endl;
system("pause");
return 0;
} |
Óë֮ǰµÄ´úÂëÏà±È£¬ ÕâÀïÖ÷ÒªÔö¼ÓÁË
Ò»¸ö¿ÕµÄº¯Êýmkernel()£¬ ²¢´øÓÐÐÞÊηû global
¶Ô¿Õº¯ÊýµÄµ÷Ó㬠²¢´øÓÐÐÞÊηû <<<1,1>>>
_global_ ΪCUDA CΪ±ê×¼CÔö¼ÓµÄÐÞÊηû£¬±íʾ¸Ãº¯Êý½«»á½»¸ø±àÒëÉ豸´úÂëµÄ±àÒëÆ÷(NVCC)²¢×îÖÕÔÚÉ豸ÉÏÔËÐС£
¶ø mainº¯ÊýÔòÒÀ¾É½»¸øÏµÍ³±àÒëÆ÷(VS2013)¡£
Æäʵ£¬CUDA¾ÍÊÇͨ¹ýÖ±½ÓÌṩAPI½Ó¿Ú»òÕßÔÚÓïÑÔ²ãÃæ¼¯³ÉһЩÐµĶ«Î÷À´ÊµÏÖÔÚÖ÷»ú´úÂëÖе÷ÓÃÉ豸´úÂë¡£
3.2 µÚÒ»¸öGPU³ÌÐò£º ʸÁ¿Ïà¼Ó
ÏÂÃæÖ÷Ҫͨ¹ý´úÂë½â¶ÁµÄÐÎʽÀ´½øÐÐÎÒÃǵĵÚÒ»¸öGPU³ÌÐò¡£
³ÌÐò×ñÑÒÔÏÂÁ÷³Ì£º
Ö÷»ú¶Ë×¼±¸Êý¾Ý -> Êý¾Ý¸´ÖƵ½GPUÄÚ´æÖÐ -> GPUÖ´Ðк˺¯Êý -> Êý¾ÝÓÉGPUÈ¡»Øµ½Ö÷»ú
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// ½Ó¿Úº¯Êý£º Ö÷»ú´úÂëµ÷ÓÃGPUÉ豸ʵÏÖʸÁ¿¼Ó·¨ c = a + b
cudaError_t addWithCuda(int *c, const int *a,
const int *b, unsigned int size);
// ºËº¯Êý£ºÃ¿¸öÏ̸߳ºÔðÒ»¸ö·ÖÁ¿µÄ¼Ó·¨
__global__ void addKernel(int *c, const int
*a, const int *b)
{
int i = threadIdx.x; // »ñÈ¡Ïß³ÌID
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50
};
int c[arraySize] = { 0 };
// ²¢ÐÐʸÁ¿Ïà¼Ó
cudaError_t cudaStatus = addWithCuda(c, a, b,
arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50}
= {%d,%d,%d,%d,%d}\n",
c[0], c[1], c[2], c[3], c[4]);
// CUDAÉè±¸ÖØÖã¬ÒÔ±ãÆäËüÐÔÄܼì²âºÍ¸ú×Ù¹¤¾ßµÄÔËÐУ¬ÈçNsight and Visual
Profiler to show complete traces.traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// ½Ó¿Úº¯ÊýʵÏÖ£º Ö÷»ú´úÂëµ÷ÓÃGPUÉ豸ʵÏÖʸÁ¿¼Ó·¨ c = a + b
cudaError_t addWithCuda(int *c, const int *a,
const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
// Ñ¡Ôñ³ÌÐòÔËÐÐÔÚÄÄ¿éGPUÉÏ£¬(¶àGPU»úÆ÷¿ÉÒÔÑ¡Ôñ)
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed!
Do you have a CUDA-capable GPU installed?");
goto Error;
}
// ÒÀ´ÎΪ c = a + bÈý¸öʸÁ¿ÔÚGPUÉÏ¿ª±ÙÄÚ´æ .
cudaStatus = cudaMalloc((void**)&dev_c,
size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a,
size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b,
size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// ½«Ê¸Á¿aºÍbÒÀ´Îcopy½øÈëGPUÄÚ´æÖÐ
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// ÔËÐк˺¯Êý£¬ÔËÐÐÉèÖÃΪ1¸öblock£¬Ã¿¸öblockÖÐsize¸öÏß³Ì
addKernel<<<1, size>>>(dev_c,
dev_a, dev_b);
// ¼ì²éÊÇ·ñ³öÏÖÁË´íÎó
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed:
%s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Í£Ö¹CPU¶ËÏ̵߳ÄÖ´ÐУ¬Ö±µ½GPUÍê³É֮ǰCUDAµÄÈÎÎñ£¬°üÀ¨kernelº¯Êý¡¢Êý¾Ý¿½±´µÈ
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize
returned error code %d after launching addKernel!\n",
cudaStatus);
goto Error;
}
// ½«¼ÆËã½á¹û´ÓGPU¸´ÖƵ½Ö÷»úÄÚ´æ
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
} |
ËÄ¡¢ CUDAºËº¯ÊýÔËÐвÎÊý
ÔÚÇ°ÃæµÄÕ½ÚÖУ¬ÎÒÃDz»Ö¹Ò»´Î¿´µ½ÁËÔÚµ÷Óö¨ÒåµÄºËº¯Êýʱ²ÉÓÃÁËÀàËÆÏÂÃæµÄÐÎʽ£º
kernel<<<1,1>>>(param1,param2,...) |
¡°<<< >>>¡±ÖвÎÊýµÄ×÷ÓÃÊǸæËßÎÒÃǸÃÈçºÎÆô¶¯ºËº¯Êý(±ÈÈçÈçºÎÉèÖÃÏß³Ì)¡£
ÏÂÃæÎÒÃÇÏÈÖ±½Ó½éÉܲÎÊý¸ÅÄȻºóÏêϸ˵Ã÷ÆäÒâÒå¡£
4.1 ºËº¯ÊýÔËÐвÎÊý
µ±ÎÒÃÇʹÓà gloabl ÉùÃ÷ºËº¯Êýºó
__global__ void
kernel(param list){ } |
ÔÚÖ÷»ú¶Ë(Host)µ÷ÓÃʱ²ÉÓÃÈçϵÄÐÎʽ£º
kernel<<<Dg,Db,
Ns, S>>>(param list); |
Dg£º intÐÍ»òÕßdim3ÀàÐÍ(x,y,z)¡£ ÓÃÓÚ¶¨ÒåÒ»¸ögridÖеÄblockÊÇÈçºÎ×éÖ¯µÄ¡£
intÐÍÔòÖ±½Ó±íʾΪ1ά×éÖ¯½á¹¹¡£
Db£º intÐÍ»òÕßdim3ÀàÐÍ(x,y,z)¡£ ÓÃÓÚ¶¨ÒåÒ»¸öblockÖеÄthreadÊÇÈçºÎ×éÖ¯µÄ¡£
intÐÍÔòÖ±½Ó±íʾΪ1ά×éÖ¯½á¹¹¡£
Ns£º size_tÀàÐÍ£¬¿Éȱʡ£¬Ä¬ÈÏΪ0¡£ ÓÃÓÚÉèÖÃÿ¸öblock³ýÁ˾²Ì¬·ÖÅäµÄ¹²ÏíÄÚ´æÍ⣬×î¶àÄܶ¯Ì¬·ÖÅäµÄ¹²ÏíÄÚ´æ´óС£¬µ¥Î»Îªbyte¡£
0±íʾ²»ÐèÒª¶¯Ì¬·ÖÅä¡£
S£º cudaStream_tÀàÐÍ£¬¿Éȱʡ£¬Ä¬ÈÏΪ0¡£ ±íʾ¸ÃºËº¯ÊýλÓÚÄĸöÁ÷¡£
4.2 Ï߳̽ṹ
¹ØÓÚCUDAµÄÏ߳̽ṹ£¬ÓÐ×ÅÈý¸öÖØÒªµÄ¸ÅÄ Grid, Block, Thread
GPU¹¤×÷ʱµÄ×îСµ¥Î»ÊÇ thread¡£
¶à¸ö thread ¿ÉÒÔ×é³ÉÒ»¸ö block£¬µ«Ã¿Ò»¸ö block ËùÄܰüº¬µÄ thread ÊýÄ¿ÊÇÓÐÏ޵ġ£ÒòΪһ¸öblockµÄËùÓÐÏß³Ì×îºÃÓ¦µ±Î»ÓÚͬһ¸ö´¦ÀíÆ÷ºËÐÄÉÏ£¬Í¬Ê±¹²Ïíͬһ¿éÄÚ´æ¡£
ÓÚÊÇÒ»¸ö blockÖеÄËùÓÐthread¿ÉÒÔ¿ìËÙ½øÐÐͬ²½µÄ¶¯×÷¶ø²»Óõ£ÐÄÊý¾ÝͨÐűÚÀÝ¡£
Ö´ÐÐÏàͬ³ÌÐòµÄ¶à¸ö block£¬¿ÉÒÔ×é³É grid¡£ ²»Í¬ block ÖÐµÄ thread ÎÞ·¨´æÈ¡Í¬Ò»¿é¹²ÏíµÄÄڴ棬ÎÞ·¨Ö±½Ó»¥Í¨»ò½øÐÐͬ²½¡£Òò´Ë£¬²»Í¬
block ÖÐµÄ thread ÄܺÏ×÷µÄ³Ì¶ÈÊDZȽϵ͵ġ£²»¹ý£¬ÀûÓÃÕâ¸öģʽ£¬¿ÉÒÔÈóÌÐò²»Óõ£ÐÄÏÔʾоƬʵ¼ÊÉÏÄÜͬʱִÐеÄ
thread ÊýÄ¿ÏÞÖÆ¡£ÀýÈ磬һ¸ö¾ßÓкÜÉÙÁ¿Ö´Ðе¥ÔªµÄÏÔʾоƬ£¬¿ÉÄÜ»á°Ñ¸÷¸ö block ÖÐµÄ thread
˳ÐòÖ´ÐУ¬¶ø·ÇͬʱִÐС£²»Í¬µÄ grid Ôò¿ÉÒÔÖ´Ðв»Í¬µÄ³ÌÐò(¼´ kernel)¡£
ÏÂͼÊÇÒ»¸ö½á¹¹¹ØÏµÍ¼£º

´ËÍ⣬Block, ThreadµÄ×éÖ¯½á¹¹¿ÉÒÔÊÇ¿ÉÒÔÊÇһά£¬¶þά»òÕßÈýά¡£ÒÔÉÏͼΪÀý£¬Block,
ThreadµÄ½á¹¹·Ö±ðΪ¶þάºÍÈýά¡£
CUDAÖÐÿһ¸öÏ̶߳¼ÓÐÒ»¸öΨһ±êʶThreadIdx£¬Õâ¸öIDËæ×Å×éÖ¯½á¹¹ÐÎʽµÄ±ä»¯¶ø±ä»¯¡£ (×¢Ò⣺IDµÄ¼ÆË㣬ͬ¼ÆËãÐÐÓÅÏÈÅÅÁеľØÕóÔªËØID˼·һÑù¡£)
»Ø¹Ë֮ǰÎÒÃǵÄʸÁ¿¼Ó·¨£º
// BlockÊÇһάµÄ£¬ThreadÒ²ÊÇһάµÄ
__global__ void addKernel(int *c, const int *a,
const int *b)
{
int i = blockIdx.x *blockDim.x + threadIdx.x;
c[i] = a[i] + b[i];
} |
// BlockÊÇһάµÄ£¬ThreadÊǶþάµÄ
__global__ void addKernel(int *c, int *a, int
*b)
{
int i = blockIdx.x * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
c[i] = a[i] + b[i];
} |
// BlockÊǶþάµÄ£¬ThreadÊÇÈýάµÄ
__global__ void addKernel(int *c, int *a, int
*b)
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int i = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x) + threadIdx.x;
c[i] = a[i] + b[i];
} |
ϱíÊDz»Í¬¼ÆËãÄÜÁ¦µÄGPUµÄ¼¼ÊõÖ¸±ê(¸ü¶à¿É²Î¼û CUDA Toolkit Documentation)

µ±È»Ò²¿ÉÒÔͨ¹ýÏÂÃæµÄ´úÂëÀ´Ö±½Ó²éѯ×Ô¼ºGPUµÄ¾ßÌåÖ¸±ê£º
#include "cuda_runtime.h"
#include <iostream>
int main()
{
cudaError_t cudaStatus;
// ³õ»ñÈ¡É豸ÊýÁ¿
int num = 0;
cudaStatus = cudaGetDeviceCount(&num);
std::cout << "Number of GPU: "
<< num << std::endl;
// »ñÈ¡GPUÉ豸ÊôÐÔ
cudaDeviceProp prop;
if (num > 0)
{
cudaGetDeviceProperties(&prop, 0);
// ´òÓ¡É豸Ãû³Æ
std::cout << "Device: " <<prop.name
<< std::endl;
}
system("pause");
return 0;
} |
ÆäÖÐ cudaDevicePropÊÇÒ»¸ö¶¨ÒåÔÚdriver_types.hÖеĽṹÌ壬´ó¼Ò¿ÉÒÔ×ÔÐв鿴Æä¶¨Òå¡£
4.3 ÄÚ´æ½á¹¹
ÈçÏÂͼËùʾ,ÿ¸ö thread ¶¼ÓÐ×Ô¼ºµÄÒ»·Ý register ºÍ local memory µÄ¿Õ¼ä¡£Í¬Ò»¸ö
block ÖеÄÿ¸ö thread ÔòÓй²ÏíµÄÒ»·Ý share memory¡£´ËÍ⣬ËùÓÐµÄ thread(°üÀ¨²»Í¬
block µÄ thread)¶¼¹²ÏíÒ»·Ý global memory¡¢constant memory¡¢ºÍ
texture memory¡£²»Í¬µÄ grid ÔòÓи÷×﵀ global memory¡¢constant
memory ºÍ texture memory¡£
ÕâÖÖÌØÊâµÄÄÚ´æ½á¹¹Ö±½ÓÓ°Ïì×ÅÎÒÃǵÄÏ̷߳ÖÅä²ßÂÔ£¬ÒòΪÐèҪͨÅÌ¿¼ÂÇ×ÊÔ´ÏÞÖÆ¼°ÀûÓÃÂÊ¡£ ÕâЩºóÐøÔÙ½øÐÐÌÖÂÛ¡£

4.4 Òì¹¹±à³Ì
ÈçÏÂͼËùʾ£¬Êdz£¼ûµÄGPU³ÌÐòµÄ´¦ÀíÁ÷³Ì£¬ÆäʵÊÇÒ»ÖÖÒì¹¹³ÌÐò£¬¼´CPUºÍGPUµÄÐͬ¡£
Ö÷»úÉÏÖ´Ðд®ÐдúÂ룬É豸ÉÏÔòÖ´Ðв¢ÐдúÂë¡£

Îå¡¢ Ïß³ÌÐ×÷
5.1 ²¢ÐгÌÐò¿éµÄ·Ö½â
Ê×ÏȻعËÎÒÃÇ֮ǰʵÏÖµÄʸÁ¿Ïà¼Ó³ÌÐò£º
// ºËº¯Êý£ºÃ¿¸öÏ̸߳ºÔðÒ»¸ö·ÖÁ¿µÄ¼Ó·¨
__global__ void addKernel(int *c, const int *a,
const int *b)
{
int i = threadIdx.x; // »ñÈ¡Ïß³ÌID
c[i] = a[i] + b[i];
}
// ÔËÐк˺¯Êý£¬ÔËÐÐÉèÖÃΪ1¸öblock£¬Ã¿¸öblockÖÐsize¸öÏß³Ì
addKernel << <1, size >> >(dev_c,
dev_a, dev_b); |
ͨ¹ýÇ°ÃæÐ¡½Ú£¬ÎÒÃÇÖªµÀÒ»¸öBlockÖеĿɿª±ÙµÄÏß³ÌÊýÁ¿ÊÇÓÐÏÞµÄ(²»³¬¹ý1024)¡£
Èç¹ûʸÁ¿Ìر𳤣¬ÉÏÃæµÄ²Ù×÷ÊÇ»á³öÏÖÎÊÌâµÄ¡£ÓÚÊÇÎÒÃÇ¿ÉÒÔ²ÉÓöà¸öÏ߳̿é(Block)À´½â¾öÏ̲߳»×ãµÄÎÊÌâ¡£
¼ÙÈçÎÒÃÇÉ趨ÿ¸öÏ߳̿é°üº¬128¸öỊ̈߳¬ÔòÐèÒªµÄÏ߳̿éµÄÊýÁ¿Îª size / 128¡£ ΪÁ˱ÜÃâ²»ÄÜÕû³ý´øÀ´µÄÎÊÌ⣬ÎÒÃÇ¿ÉÒÔÉÔ΢¶à¿ªÒ»µã
(size + 127) / 128£¬µ«ÐèÒªÔö¼ÓÅжÏÌõ¼þÀ´±ÜÃâÔ½½ç¡£
// ºËº¯Êý£ºÃ¿¸öÏ̸߳ºÔðÒ»¸ö·ÖÁ¿µÄ¼Ó·¨
__global__ void addKernel(int *c, const int *a,
const int *b, const int size)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
// »ñÈ¡Ïß³ÌID
if (i < size)
c[i] = a[i] + b[i];
}
// ÔËÐк˺¯Êý£¬ÔËÐÐÉèÖÃΪ¶à¸öblock£¬Ã¿¸öblockÖÐ128¸öÏß³Ì
addKernel <<<(size + 127) / 128, 128
>>>(dev_c, dev_a, dev_b, size); |
ͨ¹ýÇ°ÃæÐ¡½Ú£¬ÎÒÃÇͬʱҲ֪µÀÒ»¸öGridÖпɿª±ÙµÄBlockÊýÁ¿Ò²ÊÇÓÐÏ޵ġ£
Èç¹ûÊý¾ÝÁ¿´óÓÚ Block_num * Thread_num£¬ÄÇôÎÒÃǾÍÎÞ·¨ÎªÃ¿¸ö·ÖÁ¿µ¥¶À·ÖÅäÒ»¸öÏß³ÌÁË¡£
²»¹ý£¬Ò»¸ö¼òµ¥µÄ½â¾ö°ì·¨¾ÍÊÇÔں˺¯ÊýÖÐÔö¼ÓÑ»·¡£
// ºËº¯Êý£ºÃ¿¸öÏ̸߳ºÔð¶à¸ö·ÖÁ¿µÄ¼Ó·¨
__global__ void addKernel(int *c, const int *a,
const int *b, const int size)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
while (i < size)
{
c[i] = a[i] + b[i];
// Æ«ÒÆ·ÖÁ¿µÈÓÚÒ»¸öGridÖаüº¬µÄÏß³ÌÊýÁ¿
i += blockDim.x * gridDim.x;
}
}
// ÔËÐк˺¯Êý£¬ÔËÐÐÉèÖÃΪ1¸öGrid°üº¬128¸öblock£¬Ã¿¸öblock°üº¬128¸öÏß³Ì
// ÆäÖÐÒѾ¼ÙÉè size > 128*128
addKernel <<<128, 128 >>>(dev_c,
dev_a, dev_b, size); |
5.2 ¹²ÏíÄÚ´æÓëͬ²½
ÉÏÃæÌáµ½Ï߳̿éµÄ·Ö½âËÆºõÊÇΪÁËÔö¼Ó¿ÉÓõÄÏß³ÌÊýÁ¿£¬µ«ÕâÖÖ˵·¨²¢²»¿¿Æ×£¬ÒòΪÕâÍêÈ«¿ÉÒÔÓÉCUDAÔÚÄ»ºóȫȨ¿ØÖÆ¡£
Æäʵ£¬·Ö½âÏ߳̿éµÄÖØÒªÔÒòÊÇÒòΪÄÚ´æ¡£
ÔÚ¡°4.3 ÄÚ´æ½á¹¹¡±ÖÐÎÒÃÇÒѾ֪µÀ£¬Í¬Ò»¸öBlockÖеÄÏ߳̿ÉÒÔ·ÃÎÊÒ»¿é¹²ÏíÄÚ´æ¡£ÓÉÓÚ¹²ÏíÄڴ滺³åÇø×¤ÁôÔÚÎïÀíGPUÉÏ£¬¶ø²»ÊÇGPUÖ®ÍâµÄϵͳÄÚ´æÉÏ£¬Òò´Ë·ÃÎʹ²ÏíÄÚ´æµÄÑÓ³ÙÒªÔ¶Ô¶µÍÓÚ·ÃÎÊÆÕͨ»º³åÇøµÄÑÓ³Ù¡£
²»Í¬BlockÖ®¼ä´æÔÚ¸ôÀ룬Èç¹ûÎÒÃÇÐèÒª²»Í¬Ïß³ÌÖ®¼ä½øÐÐͨÐÅ£¬ÄÇô»¹ÐèÒª¿¼ÂÇÏß³Ìͬ²½µÄÎÊÌâ¡£±ÈÈçÏß³ÌA½«Ä³¸öÊýֵдÈëÄڴ棬ȻºóÏß³ÌB»á¶Ô¸ÃÊýÖµ½øÐÐһЩ²Ù×÷£¬ÄÇôÏÔÈ»±ØÐëµÈAÍê³ÉÖ®ºóB²Å¿ÉÒÔ²Ù×÷£¬Èç¹ûûÓÐͬ²½£¬³ÌÐò½«»áÒò½øÈë¡°¾ºÌ¬Ìõ¼þ¡±¶ø²úÉúÒâÏë²»µ½µÄ´íÎó¡£
½ÓÏÂÀ´ÎÒÃÇͨ¹ýÒ»¸öʸÁ¿µã»ýµÄÀý×ÓÀ´ËµÃ÷ÉÏÊöÎÊÌâ¡£
ʸÁ¿µã»ýµÄ¶¨ÒåÈçÏ£º
(x1,x2,x3,x4)?(y1,y2,y3,y4)=x1y1+x2y2+x3y3+x4y4(x1,x2,x3,x4)?(y1,y2,y3,y4)=x1y1+x2y2+x3y3+x4y4
ÓÉÉÏÃæµÄ¶¨ÒåÀ´¿´£¬µã»ýµÄʵÏÖ¿ÉÒÔ·ÖΪÁ½²½£º
£¨1£©¼ÆËãÿ¸ö·ÖÁ¿µÄ³Ë»ý£¬²¢ÔÝ´æ¸Ã½á¹û£»
£¨2£©½«ËùÓÐÁÙʱ½á¹û¼ÓºÍ¡£
ÎÒÃÇÏÈÀ´¼òµ¥ÊµÏÖϵÚÒ»²½£º
// ¶¨ÒåÎÒÃǵÄʸÁ¿³¤¶È
const int N = 64 * 256;
// ¶¨Òåÿ¸öBlockÖаüº¬µÄThreadÊýÁ¿
const int threadsPerBlock = 256;
// ¶¨Òåÿ¸öGridÖаüº¬µÄBlockÊýÁ¿, ÕâÀï32 < 64£¬ ÊÇΪÁËÄ£ÄâÏß³ÌÊýÁ¿²»×ãµÄÇé¿ö
const int blocksPerGrid = 32;
__global__ void dot( float *a, float *b, float
*c )
{
// ÉùÃ÷¹²ÏíÄÚ´æÓÃÓÚ´æ´¢ÁÙʱ³Ë»ý½á¹û£¬ÄÚ´æ´óСΪ1¸öBlockÖеÄÏß³ÌÊýÁ¿
// PS. ÿ¸öBlock¶¼Ï൱ÓÚÓÐÒ»·Ý³ÌÐò¸±±¾£¬Òò´ËÏ൱ÓÚÿ¸öBlock¶¼ÓÐÕâÑùµÄÒ»·Ý¹²ÏíÄÚ´æ
__shared__ float cache[threadsPerBlock];
// Ïß³ÌË÷Òý
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Ò»¸öBlockÖеÄÏß³ÌË÷Òý
int cacheIndex = threadIdx.x;
// ¼ÆËã·ÖÁ¿³Ë»ý£¬Í¬Ê±´¦ÀíÏ̲߳»×ãµÄÎÊÌâ
float temp = 0;
while (tid < N) {
temp += a[tid] * b[tid];
tid += blockDim.x * gridDim.x;
}
// ´æ´¢ÁÙʱ³Ë»ý½á¹û
cache[cacheIndex] = temp;
} |
Ö´ÐÐÍêÉÏÃæµÄ²¿·Ö£¬ÎÒÃÇʣϵľÍÊǰÑcacheÖеÄÖµÏà¼ÓÇóºÍ¡£ µ«ÊÇ£¬ÎÒÃDZØÐëÒª±£Ö¤ËùÓг˻ý¶¼ÒѾ¼ÆËãÍê³É£¬²ÅÄÜÈ¥¼ÆËãÇóºÍ¡£
ÃüÁîÈçÏ£º
// ¶ÔÏ߳̿éÖеÄËùÓÐÏ߳̽øÐÐͬ²½
// Ï߳̿éÖеÄËùÓÐÏ̶߳¼Ö´ÐÐÍêÇ°ÃæµÄ´úÂëºó²Å»á¼ÌÐøÍùºóÖ´ÐÐ
__syncthreads(); |
ÇóºÍ×îÖ±½ÓµÄ·½Ê½¾ÍÊÇÑ»·ÀÛ¼Ó£¬´Ëʱ¸´ÔÓ¶ÈÓëÊý×鳤¶È³ÉÕý±È¡£²»¹ýÎÒÃÇ¿ÉÒÔÔÙÓÃÒ»ÖÖ¸ü¼Ó¸ßЧµÄ·½·¨£¬Æä¸´ÔÓ¶ÈÓëÊý×鳤¶ÈµÄlog³ÉÕý±È£º½«ÖµÁ½Á½ºÏ²¢£¬ÓÚÊÇÊý¾ÝÁ¿¼õСһ°ë£¬ÔÙÖØ¸´Á½Á½ºÏ²¢Ö±ÖÁÈ«²¿¼ÆËãÍê³É¡£´úÂëÈçÏ£º
// ºÏ²¢Ëã·¨ÒªÇ󳤶ÈΪ2µÄÖ¸Êý±¶
int i = blockDim.x/2;
while (i != 0)
{
if (cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
// ×îºó½«Ò»¸öBlockµÄÇóºÍ½á¹û½øÐб£´æ
if (cacheIndex == 0)
c[blockIdx.x] = cache[0]; |
ÏÂÃæ¸ø³öÍêÕûµÄ´úÂë(¼òµ¥Æð¼û£¬²»ÔÙ×ö´íÎó¼ì²é)£º
#include <iostream>
#include "cuda_runtime.h"
//¶¨ÒåʸÁ¿³¤¶È
const int N = 64 * 256;
// ¶¨Òåÿ¸öBlockÖаüº¬µÄThreadÊýÁ¿
const int threadsPerBlock = 256;
// ¶¨Òåÿ¸öGridÖаüº¬µÄBlockÊýÁ¿, ÕâÀï32 < 64£¬ ÊÇΪÁËÄ£ÄâÏß³ÌÊýÁ¿²»×ãµÄÇé¿ö
const int blocksPerGrid = 32;
// ºËº¯Êý£ºÊ¸Á¿µã»ý
__global__ void dot(float* a, float* b, float*
c)
{
// ÉùÃ÷¹²ÏíÄÚ´æÓÃÓÚ´æ´¢ÁÙʱ³Ë»ý½á¹û£¬ÄÚ´æ´óСΪ1¸öBlockÖеÄÏß³ÌÊýÁ¿
// PS. ÿ¸öBlock¶¼Ï൱ÓÚÓÐÒ»·Ý³ÌÐò¸±±¾£¬Òò´ËÏ൱ÓÚÿ¸öBlock¶¼ÓÐÕâÑùµÄÒ»·Ý¹²ÏíÄÚ´æ
__shared__ float cache[threadsPerBlock];
// Ïß³ÌË÷Òý
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Ò»¸öBlockÖеÄÏß³ÌË÷Òý
int cacheIndex = threadIdx.x;
// ¼ÆËã·ÖÁ¿³Ë»ý£¬Í¬Ê±´¦ÀíÏ̲߳»×ãµÄÎÊÌâ
float temp = 0.0f;
while (tid < N)
{
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
// ´æ´¢ÁÙʱ³Ë»ý½á¹û
cache[cacheIndex] = temp;
// ¶ÔÏ߳̿éÖеÄËùÓÐÏ߳̽øÐÐͬ²½
// Ï߳̿éÖеÄËùÓÐÏ̶߳¼Ö´ÐÐÍêÇ°ÃæµÄ´úÂëºó²Å»á¼ÌÐøÍùºóÖ´ÐÐ
__syncthreads();
// ºÏ²¢Ëã·¨ÒªÇ󳤶ÈΪ2µÄÖ¸Êý±¶
int i = threadsPerBlock / 2;
while (i != 0)
{
if (cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
if (cacheIndex == 0)
c[blockIdx.x] = cache[0];
}
int main()
{
// ÔÚÖ÷»ú¶Ë´´½¨Êý×é
float a[N];
float b[N];
float c[threadsPerBlock];
for (size_t i = 0; i < N; i++)
{
a[i] = 1.f;
b[i] = 1.f;
}
// ÉêÇëGPUÄÚ´æ
float* dev_a = nullptr;
float* dev_b = nullptr;
float* dev_c = nullptr;
cudaMalloc((void**)&dev_a, N * sizeof(float));
cudaMalloc((void**)&dev_b, N * sizeof(float));
cudaMalloc((void**)&dev_c, blocksPerGrid
* sizeof(float));
//½«Êý¾Ý´ÓÖ÷»úcopy½øGPU
cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
//½øÐеã»ý¼ÆËã
dot<<<32, 256>>>(dev_a, dev_b,
dev_c);
//½«¼ÆËã½á¹ûcopy»ØÖ÷»ú
cudaMemcpy(c, dev_c, blocksPerGrid * sizeof(float),
cudaMemcpyDeviceToHost);
//½«Ã¿¸öblockµÄ½á¹û½øÐÐÀÛ¼Ó
for (size_t i = 1; i < blocksPerGrid; i++)
c[0] += c[i];
// Êä³ö½á¹û
std::cout << "The ground truth is
16384, our answer is " << c[0] <<
std::endl;
//ÊÍ·ÅÄÚ´æ
cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
system("pause");
return 0;
} |
Áù¡¢ º¯ÊýÓë±äÁ¿ÀàÐÍÏÞ¶¨·û
ÔÚ֮ǰµÄС½ÚÖУ¬ÎÒÃÇÒѾÓöµ½ÁË __global__ ºÍ __shared__ÕâÁ½ÖÖÀàÐÍÏÞ¶¨·û¡£ ǰÕßÊôÓÚº¯ÊýÀàÐÍÏÞ¶¨·û£¬ºóÕßÔòÊôÓÚ±äÁ¿ÀàÐÍÏÞ¶¨·û¡£
½ÓÏÂÀ´£¬ÎÒÃÇÀ´À´Á˽âÒ»ÏÂÕâÁ½ÀàÏÞ¶¨·û¡£
6.1 º¯ÊýÀàÐÍÏÞ¶¨·û
º¯ÊýÀàÐÍÏÞ¶¨·ûÓÃÀ´±êʶº¯ÊýÔËÐÐÔÚÖ÷»ú»¹ÊÇÉ豸ÉÏ£¬º¯ÊýÓÉÖ÷»ú»¹ÊÇÉ豸µ÷Óá£
__global__
__global__ÐÞÊεĺ¯ÊýΪ ºËº¯Êý¡£
ÔËÐÐÔÚÉ豸ÉÏ£»
¿ÉÒÔÓÉÖ÷»úµ÷Óã»
¿ÉÒÔÓɼÆËãÄÜÁ¦´óÓÚ3.2µÄÉ豸µ÷Óã»
±ØÐëÓÐvoid·µ»ØÀàÐÍ£»
µ÷ÓÃʱ±ØÐëÖÆ¶¨ÔËÐвÎÊý(<<< >>>)
¸Ãº¯ÊýµÄµ÷ÓÃʱÒì²½µÄ£¬¼´¿ÉÒÔ²»±ØµÈºò¸Ãº¯ÊýÈ«²¿Íê³É£¬±ã¿ÉÒÔÔÚCPUÉϼÌÐø¹¤×÷£»
__device__
ÔËÐÐÔÚÉ豸ÉÏ£»
Ö»ÄÜÓÉÉ豸µ÷Óã»
±àÒëÆ÷»áÄÚÁªËùÓÐÈÏΪºÏÊʵÄ__device__ÐÞÊεĺ¯Êý£»
__host__
ÔËÐÐÔÚÖ÷»úÉÏ£»
Ö»ÄÜÓÉÖ÷»úµ÷Óã»
Ч¹ûµÈͬÓÚº¯Êý²»¼ÓÈκÎÏÞ¶¨·û£»
²»ÄÜÓë__global__¹²Í¬Ê¹Ó㬠µ«¿ÉÒÔºÍ__device__ÁªºÏʹÓã»
__noinline__
ÉùÃ÷²»ÔÊÐíÄÚÁª
__forceinline__
Ç¿ÖÆ±àÒëÆ÷ÄÚÁª¸Ãº¯Êý
6.2 ±äÁ¿ÀàÐÍÏÞ¶¨·û
±äÁ¿ÀàÐÍÏÞ¶¨·ûÓÃÀ´±êʶ±äÁ¿ÔÚÉ豸ÉϵÄÄÚ´æÎ»Öá£
__device__ (µ¥¶ÀʹÓÃʱ)
λÓÚ global memory space
ÉúÃüÖÜÆÚΪÕû¸öÓ¦ÓÃÆÚ¼ä(¼´ÓëapplicationͬÉúËÀ)
¿ÉÒÔ±»gridÄÚµÄËùÓÐthreads¶ÁÈ¡
¿ÉÒÔÔÚÖ÷»úÖÐÓÉÒÔϺ¯Êý¶ÁÈ¡
cudaGetSymbolAddress()
cudaGetSymbolSize()
cudaMemcpyToSymbol()
cudaMemcpyFromSymbol()
__constant__
¿ÉÒÔºÍ __device__ ÁªºÏʹÓÃ
λÓÚ constant memory space
ÉúÃüÖÜÆÚΪÕû¸öÓ¦ÓÃÆÚ¼ä
¿ÉÒÔ±»gridÄÚµÄËùÓÐthreads¶ÁÈ¡
¿ÉÒÔÔÚÖ÷»úÖÐÓÉÒÔϺ¯Êý¶ÁÈ¡
cudaGetSymbolAddress()
cudaGetSymbolSize()
cudaMemcpyToSymbol()
cudaMemcpyFromSymbol()
__shared__
¿ÉÒÔºÍ __device__ ÁªºÏʹÓÃ
λÓÚÒ»¸öBlockµÄshared memory space
ÉúÃüÖÜÆÚΪÕû¸öBlock
Ö»Äܱ»Í¬Ò»blockÄÚµÄthreads¶Áд
__managed__
¿ÉÒÔºÍ __device__ ÁªºÏʹÓÃ
¿ÉÒÔ±»Ö÷»úºÍÉ豸ÒýÓã¬Ö÷»ú»òÕßÉ豸º¯Êý¿ÉÒÔ»ñÈ¡ÆäµØÖ·»òÕß¶ÁдÆäÖµ
ÉúÃüÖÜÆÚΪÕû¸öÓ¦ÓÃÆÚ¼ä
__restrict__
¸Ã¹Ø¼ü×ÖÓÃÀ´¶ÔÖ¸Õë½øÐÐÏÞÖÆÐÔ˵Ã÷£¬Ä¿µÄÊÇΪÁ˼õÉÙÖ¸Õë±ðÃû´øÀ´µÄÎÊÌâ¡£
C99±ê×¼ÖÐÒýÈëÁËrestrictedÖ¸Õ룬ÓÃÒÔ»º½âCÓïÑÔÖÐÖ¸Õë¶þÒåÐÔµÄÎÊÌâ¡£»º½âÖ¸Õë¶þÒåÐÔÎÊÌâ¿ÉÓÃÓÚ±àÒëÆ÷µÄ´úÂëÓÅ»¯¡£ÏÂÃæÊÇÒ»¸öÖ¸Õë¶þÒåÐÔµÄÀý×Ó£º
void foo(const
float* a,
const float* b,
float* c)
{
c[0] = a[0] * b[0];
c[1] = a[0] * b[0];
c[2] = a[0] * b[0] * a[1];
c[3] = a[0] * a[1];
c[4] = a[0] * b[0];
c[5] = b[0];
...
} |
ÔÚCÓïÑÔÖУ¬Ö¸Õëa, b, ºÍc¿ÉÄÜÓжþÒåÐÔ(±ðÃû)£¬Òò¶ø¶ÔÊý×écµÄдÈë¿ÉÄÜ»á¸ü¸ÄÊý×éaºÍbµÄÔªËØµÄÖµ¡£Õâ¾ÍÒâζ×Å£¬ÎªÁ˱£Ö¤³ÌÐòµÄÕýÈ·ÐÔ£¬±àÒëÆ÷²»ÄܰÑa[0]ºÍb[0]×°ÔØÈë¼Ä´æÆ÷£¬¶ÔËüÃÇ×ö³Ë·¨£¬È»ºó°Ñ½á¹ûдÈëc[0]ºÍc[1]£¬ÕâÊÇÒòΪÓÐÕâÖÖ¿ÉÄÜa[0]ºÍc[0]ÊÇͬһ¸öµØÖ·¡£¹Ê¶ø±àÒëÆ÷ÎÞ·¨¶ÔÏàͬµÄ±í´ïʽ½øÐÐÓÅ»¯¡£
ͨ¹ý°Ña, b, cÉùÃ÷ΪrestrictedÖ¸Õ룬³ÌÐòÔ±¿ÉÒÔ¶ÏÑÔÕâЩָÕëʵ¼ÊÉÏûÓжþÒåÐÔ£¨ÕâÀËùÓеÄÖ¸Õë²ÎÊý¶¼Òª±»ÉèΪrestrict£©
void foo(const
float* __restrict__a,
const float* __restrict__ b,
float* __restrict__ c) |
ÔÚÔö¼ÓÁËrestrict¹Ø¼ü×ÖÒԺ󣬱àÒëÆ÷¿ÉÒÔ¸ù¾ÝÐèÒª¶Ô´úÂë½øÐÐÓÅ»¯£º
void foo(const
float* __restrict__ a,
const float* __restrict__ b,
float* __restrict__ c)
{
float t0 = a[0];
float t1 = b[0];
float t2 = t0 * t2;
float t3 = a[1];
c[0] = t2;
c[1] = t2;
c[4] = t2;
c[2] = t2 * t3;
c[3] = t0 * t3;
c[5] = t1;
...
} |
ÕâÑù±ã¿ÉÒÔ¼õÉٷôæ´ÎÊýºÍ¼ÆËãÁ¿£¬¶ø´ú¼ÛÊÇÔö¼Ó¼Ä´æÆ÷µÄʹÓÃÁ¿¡£¿¼Âǵ½¶îÍâµÄ¼Ä´æÆ÷ʹÓÿÉÄܻήµÍoccupancy£¬Òò´ËÕâÖÖÓÅ»¯Ò²¿ÉÄÜ»á´øÀ´¸ºÃæÐ§¹û¡£
Æß¡¢ ³£Á¿ÄÚ´æÓëʼþ
GPUͨ³£°üº¬´óÁ¿µÄÊýѧ¼ÆËãµ¥Ôª£¬Òò´ËÐÔÄÜÆ¿¾±ÍùÍù²»ÔÚÓÚоƬµÄÊýѧ¼ÆËãÍÌÍÂÁ¿£¬¶øÔÚÓÚоƬµÄÄÚ´æ´ø¿í£¬¼´ÓÐʱºòÊäÈëÊý¾ÝµÄËÙÂÊÉõÖÁ²»ÄÜά³ÖÂú¸ººÉµÄÔËËã¡£
ÓÚÊÇÎÒÃÇÐèҪһЩÊÖ¶ÎÀ´¼õÉÙÄÚ´æÍ¨ÐÅÁ¿¡£ ĿǰµÄGPU¾ùÌṩÁË64KBµÄ³£Á¿Äڴ棬²¢ÇÒ¶Ô³£Á¿ÄÚ´æ²ÉÈ¡Á˲»Í¬ÓÚÈ«¾ÖÄÚ´æµÄ´¦Àí·½Ê½¡£
ÔÚijЩ³¡¾°Ï£¬Ê¹Óó£Á¿ÄÚ´æÀ´Ì滻ȫ¾ÖÄÚ´æ¿ÉÒÔÓÐЧµØÌá¸ßͨÐÅЧÂÊ¡£
7.1 ³£Á¿ÄÚ´æ
³£Á¿ÄÚ´æ¾ßÓÐÒÔÏÂÌØµã£º
ÐèÒªÓÉ __constant__ ÏÞ¶¨·ûÀ´ÉùÃ÷
Ö»¶Á
Ó²¼þÉϲ¢Ã»ÓÐÌØÊâµÄ³£Á¿ÄÚ´æ¿é£¬³£Á¿ÄÚ´æÖ»ÊÇÖ»ÊÇÈ«¾ÖÄÚ´æµÄÒ»ÖÖÐéÄâµØÖ·ÐÎʽ
ĿǰµÄGPU³£Á¿ÄÚ´æ´óС¶¼Ö»ÓÐ64K£¬Ö÷ÒªÊÇÒòΪ³£Á¿ÄÚ´æ²ÉÓÃÁ˸ü¿ìËÙµÄ16λµØÖ·Ñ°Ö·(2^16 =
65536 = 64K)
¶ÔÓÚÊý¾Ý²»Ì«¼¯ÖлòÕßÖØÓÃÂʲ»¸ßµÄÄÚ´æ·ÃÎÊ£¬¾¡Á¿²»ÒªÊ¹Óó£Á¿Äڴ棬·ñÔòÉõÖÁ»áÂýÓÚʹÓÃÈ«¾ÖÄÚ´æ
³£Á¿ÄÚ´æÎÞÐècudaMalloc()À´¿ª±Ù£¬¶øÊÇÔÚÉùÃ÷ʱֱ½ÓÌá½»Ò»¸ö¹Ì¶¨´óС£¬±ÈÈç __constant__
float mdata[1000]
³£Á¿ÄÚ´æµÄ¸³Öµ²»ÄÜÔÙÓÃcudaMemcpy()£¬¶øÊÇʹÓÃcudaMemcpyToSymbol()
³£Á¿ÄÚ´æ´øÀ´ÐÔÄÜÌáÉýµÄÔÒòÖ÷ÒªÓÐÁ½¸ö£º
¶Ô³£Á¿ÄÚ´æµÄµ¥´Î¶Á²Ù×÷¿ÉÒԹ㲥µ½ÆäËûµÄ¡°ÁÚ½ü(nearby)¡±Ị̈߳¬Õ⽫½ÚÔ¼15´Î¶ÁÈ¡²Ù×÷
³£Á¿ÄÚ´æµÄÊý¾Ý½«»º´æÆðÀ´£¬Òò´Ë¶ÔÓÚÏàͬµØÖ·µÄÁ¬Ðø²Ù×÷½«²»»á²úÉú¶îÍâµÄÄÚ´æÍ¨ÐÅÁ¿¡£
¶ÔÓÚÔÒò1£¬Éæ¼°µ½ Ïß³ÌÊø(Warp)µÄ¸ÅÄî¡£
ÔÚCUDA¼Ü¹¹ÖУ¬Ïß³ÌÊøÊÇÖ¸Ò»¸ö°üº¬32¸öÏ̵߳ļ¯ºÏ£¬Õâ¸öÏ̼߳¯ºÏ±»¡°±àÖ¯ÔÚÒ»Æð¡±²¢ÇÒÒÔ¡°²½µ÷Ò»ÖÂ(Lockstep)¡±µÄÐÎʽִÐС£
¼´Ïß³ÌÊøÖеÄÿ¸öÏ̶߳¼½«ÔÚ²»Í¬Êý¾ÝÉÏÖ´ÐÐÏàͬµÄÖ¸Áî¡£
µ±´¦Àí³£Á¿ÄÚ´æÊ±£¬NVIDIAÓ²¼þ½«°Ñµ¥´ÎÄÚ´æ¶ÁÈ¡²Ù×÷¹ã²¥µ½Ã¿¸ö°ëÏß³ÌÊø(Half-Warp)¡£ÔÚ°ëÏß³ÌÊøÖаüº¬16¸öỊ̈߳¬¼´Ïß³ÌÊøÖÐÏß³ÌÊýÁ¿µÄÒ»°ë¡£Èç¹ûÔÚ°ëÏß³ÌÊøÖеÄÿ¸öÏ̴߳ӳ£Á¿ÄÚ´æµÄÏàͬµØÖ·É϶ÁÈ¡Êý¾Ý£¬ÄÇôGPUÖ»»á²úÉúÒ»´Î¶ÁÈ¡ÇëÇó²¢ÔÚËæºó½«Êý¾Ý¹ã²¥µ½Ã¿¸öÏ̡߳£Èç¹û´Ó³£Á¿ÄÚ´æÖжÁÈ¡´óÁ¿Êý¾Ý£¬ÄÇôÕâÖÖ·½Ê½²úÉúµÄÄÚ´æÁ÷Á¿Ö»ÊÇʹÓÃÈ«¾ÖÄÚ´æÊ±µÄ1/16¡£
¶ÔÓÚÔÒò2£¬Éæ¼°µ½»º´æµÄ¹ÜÀí
ÓÉÓÚ³£Á¿ÄÚ´æµÄÄÚÈÝÊDz»·¢Éú±ä»¯µÄ£¬Òò´ËÓ²¼þ½«Ö÷¶¯°ÑÕâ¸ö³£Á¿Êý¾Ý»º´æÔÚGPUÉÏ¡£ÔÚµÚÒ»´Î´Ó³£Á¿ÄÚ´æµÄij¸öµØÖ·É϶ÁÈ¡ºó£¬µ±ÆäËû°ëÏß³ÌÊøÇëÇóͬһ¸öµØÖ·Ê±£¬ÄÇô½«ÃüÖлº´æ£¬ÕâͬÑù¼õÉÙÁ˶îÍâµÄÄÚ´æÁ÷Á¿¡£
ÁíÒ»·½Ãæ, ³£Á¿ÄÚ´æµÄʹÓÃÒ²¿ÉÄÜ»á¶ÔÐÔÄܲúÉú¸ºÃæµÄÓ°Ïì¡£°ëÏß³ÌÊø¹ã²¥¹¦ÄÜʵ¼ÊÉÏÊÇÒ»°ÑË«Èн£¡£ËäÈ»µ±ËùÓÐ16¸öÏ̶߳¼¶ÁÈ¡ÏàͬµØÖ·Ê±£¬Õâ¸ö¹¦ÄÜ¿ÉÒÔ¼«´óÌáÉýÐÔÄÜ£¬µ«µ±ËùÓÐ16¸öÏ̷ֱ߳ð¶ÁÈ¡²»Í¬µÄµØÖ·Ê±£¬Ëüʵ¼ÊÉϻήµÍÐÔÄÜ¡£ÒòΪÕâ16´Î²»Í¬µÄ¶ÁÈ¡²Ù×÷»á±»´®Ðл¯£¬´Ó¶øÐèÒª16±¶µÄʱ¼äÀ´·¢³öÇëÇó¡£µ«Èç¹û´ÓÈ«¾ÖÄÚ´æÖжÁÈ¡£¬ÄÇôÕâЩÇëÇó»áͬʱ·¢³ö¡£
7.2 ³£Á¿ÄÚ´æÓ¦ÓÃʵÀý ¡ª¡ª ¹âÏ߸ú×Ù
ÏÂÃæÍ¨¹ýÒ»¸ö¹âÏ߸ú×ÙµÄʵÀýÀ´ËµÃ÷һϳ£Á¿ÄÚ´æµÄʹÓÃЧ¹û¡£
ÏÂÃæµÄ¹âÏ߸ú×Ù²»Éæ¼°¹âÔ´ÒÔ¼°¹âÏß·´É䣬ֻÊǼòµ¥µÄÀàËÆÓÚ¡°Í¶Ó°¡±µÄ²Ù×÷£¬ÈçÏÂͼËùʾ¡£

Í¶Ó°Æ½ÃæÇ°Ãæ»áÓдóÁ¿´æÔÚÖØµþµÄÇòÌå(ÕâÀïÎÒÖ»»ÁË1¸ö)£¬Í¶Ó°Æ½ÃæÉÏÿ¸öÏñËØµã»á·¢Éä³öÒ»ÌõÉäÏß(ÉäÏß·½ÏòÈÏΪÊÇZ·½Ïò)£¬ÎÒÃÇÐèÒªºÍËùÓÐÇòÌåÅжÏÏཻÇé¿ö¡£
Èç¹ûºÍ¶à¸öÇòÌåÏཻ£¬ÔòÑ¡Ôñ×î½üµÄ½»µã(¼´ÎÞ·¨¿´µ½ÕÚµ²µÄÇòÌå)¡£ ¸ù¾Ý½»µãµ½¶ÔÓ¦ÇòÐĵľàÀë(Z·½Ïò¾àÀë)È·¶¨Í¶Ó°µãµÄÏñËØÖµ£¬¾àÀëÔ½Ô¶ÔòÔ½ÁÁ¡£
Èç¹û¾àÀëΪÎÞÇî´ó£¬Ôò±íÃ÷ûÓÐÏཻ£¬ÔòÖÃΪºÚÉ«±³¾°¡£
ÓÉÓÚÿ¸öÏñËØ¶¼»áÉä³öÒ»ÌõÉäÏߣ¬È»ºóºÍËùÓÐÇòÌ弯ËãÏཻ£¬Òò´ËÐèÒª¾³£·ÃÎʹ̶¨µÄÇòÌå²ÎÊý¡£ Òò´Ë£¬ÎªÁËÌá¸ß·ÃÎÊЧÂÊ£¬ÎÒÃǽ«ÇòÌåÐÅÏ¢¶¨Òåµ½³£Á¿ÄÚ´æ¡£
´úÂëÈçÏÂ(ÐèÒªOpenCV)£º
#include "cuda_runtime.h"
#include "highgui.hpp"
#include <time.h> using namespace cv;
#define INF 2e10f // ¶¨ÒåÎÞÇîÔ¶¾àÀë(ÓÃÓÚ±íʾûÓÐÇòÌåÏཻ)
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES 100 //ÇòÌåÊýÁ¿
#define DIM 1024 //ͼÏñ´óС
// ÇòÌåÐÅÏ¢½á¹¹Ìå
struct Sphere
{
float r, g, b; // ÇòÌåÑÕÉ«
float radius; // ÇòÌå°ë¾¶
float x, y, z; // ÇòÌå¿Õ¼ä×ø±ê
// ¼ÆËã´Ó(ox, oy)·¢³öµÄÉäÏßÓëÇòÌåµÄ½»µã
// nΪ½»µãµ½ÇòÐĵľàÀë(Z·½Ïò¾àÀë)ÓëÇò°ë¾¶µÄ±ÈÖµ
__device__ float hit(float ox, float oy, float
*n)
{
float dx = ox - x;
float dy = oy - y;
if (dx*dx + dy*dy < radius*radius)
{
float dz = sqrt(radius*radius - dx*dx - dy*dy);
*n = dz / sqrt(radius*radius);
return dz + z;
}
return -INF;
}
};
// ÉùÃ÷ÇòÌåÊý×é
__constant__ Sphere s[SPHERES];
// ¹âÏ߸ú×ٺ˺¯Êý
//__global__ void rayTracing(unsigned char*
ptr, Sphere* s)
__global__ void rayTracing(unsigned char* ptr)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
// ÒÔͼÏñÖÐÐÄÎª×ø±êÔµã
float ox = (x - DIM / 2);
float oy = (y - DIM / 2);
float r = 0, g = 0, b = 0;
float maxz = -INF;
for (int i = 0; i < SPHERES; i++)
{
float n;
float t = s[i].hit(ox, oy, &n);
// ÅжÏÊÇ·ñ´æÔÚÏཻÇòÌå
if (t > maxz)
{
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
ptr[offset * 3 + 2] = (int)(r * 255);
ptr[offset * 3 + 1] = (int)(g * 255);
ptr[offset * 3 + 0] = (int)(b * 255);
}
int main(int argc, char* argv[])
{
// ½¨Á¢Ê¼þÓÃÓÚ¼ÆÊ±
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
Mat bitmap = Mat(Size(DIM, DIM), CV_8UC3,
Scalar::all(0));
unsigned char *devBitmap;
(cudaMalloc((void**)&devBitmap, 3 * bitmap.rows*bitmap.cols));
// cudaMalloc((void**)&s, sizeof(Sphere)*SPHERES);
// ´´½¨Ëæ»úÇòÌå
Sphere *temps = (Sphere*)malloc(sizeof(Sphere)*SPHERES);
srand(time(0)); //Ëæ»úÊýÖÖ×Ó
for (int i = 0; i < SPHERES; i++)
{
temps[i].r = rnd(1.0f);
temps[i].g = rnd(1.0f);
temps[i].b = rnd(1.0f);
temps[i].x = rnd(1000.0f) - 500;
temps[i].y = rnd(1000.0f) - 500;
temps[i].z = rnd(1000.0f) - 500;
temps[i].radius = rnd(100.0f) + 20;
}
// cudaMemcpy(s, temps, sizeof(Sphere)*SPHERES,
cudaMemcpyHostToDevice);
// ½«ÇòÌå²ÎÊýcopy½ø³£Á¿ÄÚ´æ
cudaMemcpyToSymbol(s, temps, sizeof(Sphere)*SPHERES);
free(temps);
dim3 grids(DIM / 16, DIM / 16);
dim3 threads(16, 16);
// rayTracing<<<grids, threads>>>(devBitmap,
s);
rayTracing << <grids, threads >>
> (devBitmap);
cudaMemcpy(bitmap.data, devBitmap, 3 * bitmap.rows*bitmap.cols,
cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start,
stop);
printf("Processing time: %3.1f ms\n",
elapsedTime);
imshow("Ray Tracing", bitmap);
waitKey();
cudaFree(devBitmap);
// cudaFree(s);
return 0;
} |
ʵÑéЧ¹ûÈçÏÂͼ£º

7.3 ʹÓÃʼþÀ´²âÁ¿ÐÔÄÜ
ΪÁËÖ±¹ÛµØ¿´µ½³£Á¿ÄÚ´æ´øÀ´µÄÔöÒæ£¬ÎÒÃÇÐèÒª²âÁ¿³ÌÐòÔËÐеÄʱ¼ä¡£
ÒÔÍùµÄ»°ÎÒÃÇ´ó¶à²ÉÓÃCPU»òÕß²Ù×÷ϵͳÖеÄij¸ö¼ÆÊ±Æ÷£¬µ«ÊÇÕâºÜÈÝÒ×´øÀ´¸÷ÖÖÑÓ³Ù(°üÀ¨²Ù×÷ϵͳÏ̵߳÷¶È¡¢¸ß¾«¶ÈCPU¼ÆÊ±Æ÷¿ÉÓÃÐÔµÈ)¡£
ÌØ±ðµØ£¬ºËº¯ÊýÓëCPU³ÌÐòÊÇÒì²½Ö´Ðеģ¬Õâ¸üÒ×´øÀ´ÒâÏë²»µ½µÄÑÓ³Ù¡£µ±È»£¬Õë¶ÔÕâ¸öÎÊÌ⣬ÎÒÃÇ¿ÉÒÔʹÓÃcudaThreadSynchronize()º¯Êý½øÐÐͬ²½È»ºóÔÙÀûÓÃCPU¼ÆÊ±¡£
³ýÁ˲ÉÓÃCPUÖ÷»ú¶Ë¼ÆÊ±Ö®Í⣬¸ü׼ȷµÄ·½·¨Ó¦¸ÃÊÇÀûÓÃCUDAµÄʼþAPI¡£
¼ÆÊ±Ä£°åÈçÏ£º
cudaEvent_t start,
stop;
float time = 0.f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
/*****************************************
*********** ÐèÒª¼ÆÊ±µÄ´úÂ벿·Ö**************
******************************************/
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ime, start, stop);
std::cout << time << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop); |
°Ë¡¢ ÎÆÀíÄÚ´æ
ͬ³£Á¿ÄÚ´æÒ»Ñù£¬ÎÆÀíÄڴ棨Texture Memory£©Ò²ÊÇÒ»ÖÖÖ»¶ÁÄÚ´æ¡£ Ö®ËùÒÔ³ÆÖ®Îª ¡°ÎÆÀí¡±£¬ÊÇÒòΪ×î³õÊÇΪͼÐÎÓ¦ÓÃÉè¼ÆµÄ¡£
µ±³ÌÐòÖдæÔÚ´óÁ¿¾Ö²¿¿Õ¼ä²Ù×÷ʱ£¬ÎÆÀíÄÚ´æ¿ÉÒÔÌá¸ßÐÔÄÜ¡£
8.1 ÎªÊ²Ã´ÎÆÀíÄÚ´æ¿ÉÒÔ¼ÓËÙ
ÎÆÀíÄÚ´æ¿ÉÒÔ¼ÓËÙÓ¦ÓÃÖ÷ÒªÔÒòÓÐÁ½·½Ã棺
1. ÎÆÀíÄÚ´æÒ²ÊÇ»º´æÔÚÆ¬Éϵģ¬Òò´ËһЩÇé¿öÏÂÏà±È´ÓоƬÍâµÄDRAMÉÏ»ñÈ¡Êý¾Ý£¬ÎÆÀíÄÚ´æ¿ÉÒÔͨ¹ý¼õÉÙÄÚ´æÇëÇóÀ´Ìá¸ß´ø¿í¡£
2. ÎÆÀíÄÚ´æÊÇÕë¶ÔͼÐÎÓ¦ÓÃÉè¼ÆµÄ£¬¿ÉÒÔ¸ü¸ßЧµØ´¦Àí¾Ö²¿¿Õ¼äµÄÄÚ´æ·ÃÎÊ¡£

´ÓÊýѧµÄ½Ç¶È£¬ÉÏͼÖеÄ4¸öµØÖ·²¢·ÇÁ¬ÐøµÄ£¬ÔÚÒ»°ãµÄCPU»º´æÖУ¬ÕâЩµØÖ·½«²»»á»º´æ¡£µ«ÓÉÓÚGPUÎÆÀí»º´æÊÇרÃÅΪÁ˼ÓËÙÕâÖÖ·ÃÎÊģʽ¶øÉè¼ÆµÄ£¬Òò´ËÈç¹ûÔÚÕâÖÖÇé¿öÖÐʹÓÃÎÆÀíÄÚ´æ¶ø²»ÊÇÈ«¾ÖÄڴ棬ÄÇô½«»á»ñµÃÐÔÄܵÄÌáÉý¡£
8.2 ÎÆÀíÄÚ´æµÄÊý¾ÝÏÞÖÆ
ÏÂͼÊdz£¼ûÄÚ´æµÄ´æ´¢Î»Öã¬ÒÔ¼°¶Áȡģʽ£º

ÓÉÉÏͼ¿ÉÒÔ¿´³ö£¬ÎÆÀíÄÚ´æÊÇÖ»¶ÁµÄ£¬¶øÇÒ¿ÉÒÔͬʱ±»Ö÷»úºÍÉ豸¶ÁÈ¡¡£
´ËÍâ£¬ÎÆÀíÄÚ´æ¿ÉÒÔ±»ÉùÃ÷Ϊ1D¡¢2D»òÕß3DÊý×飬µ«Êý×éµÄ´óСÓÐÏÞÖÆ£¬¾ßÌå¿ÉÒÔµã»÷Á´½Ó²é¿´¡¶²»Í¬¼ÆËãÄÜÁ¦GPUµÄÖ¸±ê¡·¡£
Í¬Ê±ÎÆÀíÄÚ´æÖд洢µÄÊý¾ÝÒ²±ØÐëÉùÃ÷Ϊ¹Ì¶¨ÀàÐÍ£¬¼´¸÷ÖÖ¶ÔÆëÀàÐÍÖеÄÒ»ÖÖ£¬Èç(char¡¢short¡¢int¡¢long¡¢float¡¢doubleµÈ)¡£
8.3 ÎÆÀíÄÚ´æÊ¹ÓÃ
ÎÆÀíÄÚ´æµÄʹÓÃÒÀÀµÓÚAPIº¯Êý¡£ÏÂÃæÖ±½Ó¸ø³ö³£¼ûµÄʹÓÃÁ÷³Ì£º
8.3.1 ÉùÃ÷ÎÆÀí±äÁ¿
texture<Type,
Dim, ReadMode> VarName;
//Type: Ç°ÃæÌáµ½µÄ»ù±¾µÄÕûÐͺ͸¡µãÀàÐÍ£¬ÒÔ¼°ÆäËüµÄ¶ÔÆëÀàÐÍ
//Dim£º ÎÆÀíÊý×éµÄά¶È£¬ÖµÎª1»ò2»ò3£¬Ä¬ÈÏȱʡΪ1
//ReadMode£ºcudaReadModelNormalizedFloat »ò cudaReadModelElementType(ĬÈÏ) |
cudaReadModelNormalizedFloat£ºÈç¹ûTypeΪÕûÐÍ(8bit»òÕß16bit)£¬Ôò¶ÁÈ¡Êý¾Ýʱ»á×Ô¶¯½«ÕûÐÍÊý¾Ýת»¯Îª¸¡µãÊý¡£¾ßÌ嵨£¬Èç¹ûÊÇÎÞ·ûºÅÕûÐÍ£¬Ôòת»¯Îª[0
1]Ö®¼äµÄ¸¡µãÊý£»Èç¹ûÊÇÓзûºÅÕûÐÍ£¬Ôòת»¯Îª[-1 1]Ö®¼äµÄ¸¡µãÊý¡£
cudaReadModelElementType£ºÄ¬ÈÏÖµ£¬²»½øÐÐÈκÎת»»
8.3.2 ¿ª±ÙÄÚ´æ
·ÖÅäÄڴ棬ÄÚ´æÐÎʽÓÐÁ½ÖÖ ÏßÐÔÄÚ´æ ºÍ CUDAÊý×é¡£
ÏßÐÔÄÚ´æÍ¨¹ýcudaMalloc()¡¢cudaMallocPitch()»òÕßcudaMalloc3D()·ÖÅ䣻
CUDAÊý×é¿ÉÒÔͨ¹ýcudaMalloc3DArray()»òÕßcudaMallocArray()·ÖÅ䡣ǰÕß¿ÉÒÔ·ÖÅä1D¡¢2D¡¢3DµÄÊý×飬ºóÕßÒ»°ãÓÃÓÚ·ÖÅä2DµÄCUDAÊý×é¡£
±ÈÈ翪±ÙÒ»¸ö¶þάCUDAÊý×éArrayName(64 x 64)£º
cudaChannelFormatDesc
channelDesc = cudaCreateChannelDesc<float>();
cudaArray *ArrayName;
cudaMallocArray(&ArrayName, &channelDesc,
64, 64); |
8.3.3 °ó¶¨ÎÆÀíÄÚ´æ
ÎÆÀí°ó¶¨(texture binding)µÄ×÷ÓÃÓÐÁ½¸ö£º
½«Ö¸¶¨µÄ»º³åÇø×÷ÎªÎÆÀíÀ´Ê¹Óã»
ÎÆÀíÒýÓÃ×÷ÎªÎÆÀíµÄ¡°Ãû×Ö¡±
ͨ³£Ê¹ÓÃcudaBindTexture() »ò cudaBindTexture2D() ·Ö±ð½«ÏßÐÔÄÚ´æ°ó¶¨µ½1DºÍ2DÎÆÀíÄڴ棬ʹÓÃcudaBindTextureToArray()½«CUDAÊý×éÓëÎÆÀí°ó¶¨¡£
×¢Ò⣺ ÏßÐÔÄÚ´æÖ»ÄÜÓëһά»ò¶þÎ¬ÎÆÀí°ó¶¨£»CUDAÊý×éÔò¿ÉÒÔÓëһά¡¢¶þά¡¢ÈýÎ¬ÎÆÀí°ó¶¨¡£ÇÐÁ½ÖÖ°ó¶¨ÓÐ×ÅһЩ²»Í¬µÄÌØÐÔ¡£

ÒÔcudaBindTexture()ΪÀý(ÓÐÁ½ÖÖ¼¶±ðµÄµ÷ÓÃ)
high-level API
cudaBindtexture
(size *t offset, const struct texture<T, dim,
readMode> & tex , const void * devptr,
size_t size= UINT_MAX)
// offset£º ×Ö½ÚÆ«ÒÆÁ¿
// tex: ´ý°ó¶¨µÄÎÆÀí
// devPtr£º É豸ÉÏÒÑ¿ª±ÙµÄÄÚ´æµØÖ·
// size £º ¿ª±ÙµÄÄÚ´æ´óС |
µ÷ÓÃÀý×ÓÈçÏ£º
texture<Type,
Dim, ReadMode> tex;
cudaMalloc((void**)&devPtr, size);
cudaBindTexture(NULL, tex, devPtr, size); |
low-level API
´ËÖÖÇé¿öÉÔ΢¸´ÔÓÒ»µã£¬ Êǽ«¿ª±ÙµÄ»º´æÓë¡°ÎÆÀí²Î¿¼Ïµ¡±°ó¶¨¡£
ÎÆÀí²ÎÕÕϵ£¨texture reference£©Ô¼¶¨´ÓÊý¾ÝµÄµØÖ·µ½ÎÆÀí×ø±êµÄÓ³É䷽ʽ£¬Æä¶¨ÒåÈçÏ£º
struct textureReference
{
int normalized;
enum cudaTextureFilterMode filterMode;
enum cudaTextureAddressMode addressMode[3];
struct cudaChannelFormatDesc channelDesc;
...
} |
¿ÉÄÜÓõ½µÄÖ÷ÒªÓÐÏÂÃæ3¸ö²ÎÊý£º
normalizedÉèÖÃÊÇ·ñ¶ÔÎÆÀí×ø±ê¹éÒ»»¯£¨ÎÆÀíÄÚ´æÖ§³Ö¸¡µã×ø±êË÷Òý£¬[0 ~ N]µÄ×ø±êË÷Òý»á±»¹éÒ»»¯µ½
[0 1-1/N]£©
filterModeÓÃÓÚÉèÖÃÎÆÀíµÄÂ˲¨Ä£Ê½(ÎÆÀí»º´æÒ»´ÎÔ¤È¡Ê°È¡×ø±ê¶ÔӦλÖø½½üµÄ¼¸¸öÏóÔª£¬¿ÉÒÔʵÏÖÂ˲¨Ä£Ê½)
addressMode˵Ã÷ÁËѰַ·½Ê½
´ËÊ±ÎÆÀíÄÚ´æ±»ÉùÃ÷ΪÒýÓÃÐÎʽ£º
texture<DataType,
Type, ReadMode> texRef; |
ÍêÕûµÄʹÓÃʾÀýÈçÏ£º
texture<float,
cudaTextureType1D, cudaReadModeElementType>
texRef; textureReference* texRefPtr;
cudaGetTextureReference(&texRefPtr, &texRef);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
size_t offset;
cudaBindTexture2D(&offset, texRefPtr, devPtr,
&channelDesc, width, height, pitch); |
8.3.4 Ê°È¡ÎÆÀíÄÚ´æ
ÔÚkernelÖзÃÎÊÎÆÀíÄÚ´æ³ÆÎªÎÆÀíʰȡ(texture fetching)¡£ ·ÃÎÊÎÆÀíÄÚ´æ±ØÐëʹÓÃAPIº¯Êý£¬¶ø²»ÄÜʹÓ÷½À¨ºÅË÷ÒýµÄÐÎʽ¡£ÒòΪÎÒÃÇÐèÒªAPIº¯Êý½«¶ÁÈ¡ÇëÇóת·¢µ½ÎÆÀíÄÚ´æ¶ø²»ÊÇÈ«¾ÖÄÚ´æ¡£
¶ÔÓÚÏßÐÔ´æ´¢Æ÷°ó¶¨µÄÎÆÀí£¬Ê¹ÓÃtex1Dfetch()·ÃÎÊ£¬²ÉÓõÄÎÆÀí×ø±êÊÇÕûÐÍ¡£ ¶ÔÓëһά¡¢¶þά¡¢ÈýάcudaÊý×é°ó¶¨µÄÎÆÀí£¬·Ö±ðʹÓÃtex1D(),
tex2D() ºÍ tex3D()º¯Êý·ÃÎÊ£¬²¢ÇÒʹÓø¡µãÐÍÎÆÀí×ø±ê¡£
´ËÍ⣬tex1Dfetch()ÊÇÒ»¸ö±àÒëÆ÷ÄÚÖú¯Êý£¬Òò´ËÎÆÀíÒýÓñØÐëÉùÃ÷ΪÎļþÓòÄÚµÄÈ«¾Ö±äÁ¿£¬ÒòΪ±àÒëÆ÷ÔÚ±àÒë½×¶ÎÐèÒªÖªµÀtex1Dfetch()¶ÔÄÄÐ©ÎÆÀí²ÉÑù¡£
8.3.5 ½â°óÎÆÀíÄÚ´æ
×îºóµ±³ÌÐò½áÊøÊ±£¬ÎÒÃÇÐèÒªÊÍ·Å֮ǰ¿ª±ÙµÄÄÚ´æ²¢½â³ýÎÆÀíÄÚ´æµÄ°ó¶¨¡£
ʾÀý´úÂëÈçÏ£º
cudaUnbindTexture
(tex);
cudaFree(devPtr); |
8.4 ʹÓÃÎÆÀíÄÚ´æÊµÏÖ¾ùÖµÂ˲¨
ÏÂÃæ¸ø³öÒ»¸öÎÒʵÏֵļòµ¥µÄ3x3¾ùÖµÂ˲¨£¬¼´Â˲¨ºóµÄÒ»¸öÏñËØÖµÊÇÆäÖÜΧ3x3·¶Î§ÄÚ9¸öÏñËØÖµµÄƽ¾ùÖµ¡£
¾ùÖµÂ˲¨¿ÉÒÔÓÐЧȥ³ý¸ß˹ÔëÉù¡£
#include "cuda_runtime.h"
#include "opencv.hpp"
#include "highgui.hpp"
// ÉùÃ÷2DÎÆÀíÄÚ´æÒýÓÃ
texture<uchar, 2, cudaReadModeElementType>
texRef;
// ºËº¯Êý£¬ ÓÃÓÚ¾ùÖµÂ˲¨
__global__ void meanfilter_kernel(uchar* dstcuda,
int width)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
// 3x3·¶Î§ÄÚÇóºÍ£¬ È»ºó¼ÆËã¾ùÖµ
// ×¢Òâtex2D»á×Ô¶¯¶ÔË÷ÒýÔ½½ç½øÐд¦Àí£¬Òò´Ë²»ÓÃÅжÏx-1ÊÇ·ñСÓÚ0
dstcuda[y * width + x] = ( tex2D(texRef, x -
1, y - 1) + tex2D(texRef, x, y - 1) + tex2D(texRef,
x + 1, y - 1) +
tex2D (texRef, x - 1, y) + tex2D(texRef, x, y)
+ tex2D(texRef, x + 1, y) +
tex2D(texRef, x - 1, y + 1) + tex2D(texRef,
x, y + 1) + tex2D(texRef, x + 1, y + 1)) / 9;
}
int main()
{
// ¶ÁÈ¡´ýÂ˲¨Í¼Æ¬(1024x640º¬¸ß˹ÔëÉù»Ò¶Èͼ)
cv::Mat srcImg = cv::imread ("gray_scarleet_noisy.jpg", cv::IMREAD_GRAYSCALE);
// ¿ª±ÙϵͳÄÚ´æ
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc
(8,0,0,0,cudaChannelFormatKindUnsigned);
cudaArray* srcArray;
cudaMallocArray (&srcArray, &channelDesc,
srcImg.cols, srcImg.rows);
// ½«Í¼Ïñcopy½øÄÚ´æ
cudaMemcpyToArray (srcArray, 0, 0, srcImg.data,
srcImg.cols * srcImg.rows, cudaMemcpyHostToDevice);
// ½«ÄÚ´æÓëÎÆÀíÒýÓðó¶¨
cudaBindTextureToArray (&texRef,srcArray,&channelDesc);
// ÉùÃ÷ÓÃÓÚ´¢´æÂ˲¨ºóµÄͼÏñ
cv::Mat dstImg = cv::Mat(cv::Size (srcImg.cols,
srcImg.rows), CV_8UC1);
uchar * dstcuda;
cudaMalloc((void**)& dstcuda, srcImg.cols
* srcImg.rows * sizeof(uchar));
// ÔËÐк˺¯Êý
dim3 dimBlock(32, 32);
dim3 dimGrid ((srcImg.cols + dimBlock.x - 1)
/ dimBlock.x, (srcImg.rows + dimBlock.y - 1)
/ dimBlock.y);
meanfilter_kernel << <dimGrid, dimBlock
>> > (dstcuda, srcImg.cols);
// Ïß³Ìͬ²½
cudaThreadSynchronize();
// ½«Êý¾Ýcopy»ØÖ÷»ú
cudaMemcpy (dstImg.data, dstcuda, srcImg.cols
* srcImg.rows * sizeof(uchar), cudaMemcpyDeviceToHost);
// ½â³ý°ó¶¨²¢ÊÍ·ÅÄÚ´æ
cudaUnbindTexture(&texRef);
cudaFreeArray(srcArray);
cudaFree(dstcuda);
// ÏÔʾЧ¹ûͼ
cv::imshow("Source Image", srcImg);
cv::imshow("Result Image", dstImg);
cvWaitKey();
return 0;
} |
Ч¹ûÈçÏ£º
¸ß˹ÔëÉùͼ

Â˲¨ºó

¾Å¡¢ Ô×Ó²Ù×÷
Ô×Ó²Ù×÷ ÊÇÖ¸¶ÔÈ«¾ÖºÍ¹²ÏíÄÚ´æÖеÄ32λ»òÕß64λÊý¾Ý½øÐÐ ¡°¶ÁÈ¡-ÐÞ¸Ä-¸²Ð´¡±ÕâÒ»²Ù×÷¡£
Ô×Ó²Ù×÷¿ÉÒÔ¿´×÷ÊÇÒ»ÖÖ×îСµ¥Î»µÄÖ´Ðйý³Ì¡£ ÔÚÆäÖ´Ðйý³ÌÖУ¬²»ÔÊÐíÆäËû²¢ÐÐÏ̶߳ԸñäÁ¿½øÐжÁÈ¡ºÍдÈëµÄ²Ù×÷¡£
Èç¹û·¢Éú¾ºÕù£¬ÔòÆäËûÏ̱߳ØÐëµÈ´ý¡£
ÏÂÃæÏȸø³öÔ×Ó²Ù×÷º¯ÊýµÄÁÐ±í£¬ºóÐø»á¸ø³öÒ»¸öÓ¦ÓÃÀý×Ó¡£
9.1 Ô×Ó²Ù×÷º¯ÊýÁбí
9.1.1 atomicAdd()
int atomicAdd(int*
address, int val);
unsigned int atomicAdd(unsigned int* address,
unsigned int val);
unsigned long long int atomicAdd(unsigned long
long int* address, unsigned long long int val);
float atomicAdd(float* address, float val);
double atomicAdd(double* address, double val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ»ò64λ×Öold£¬¼ÆËã(old + val)£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
×¢Ò⣺
32λ¸¡µãÊýµÄ²Ù×÷Ö»ÊÊÓÃÓÚ¼ÆËãÄÜÁ¦´óÓÚ2.0µÄGPU
64λ¸¡µãÊýµÄ²Ù×÷Ö»ÊÊÓÃÓÚ¼ÆËãÄÜÁ¦´óÓÚ6.0µÄGPU
µ«¿ÉÒÔͨ¹ýÒÔϲÙ×÷ÔÚ¼ÆËãÄÜÁ¦²»×ãµÄGPUÉÏʵÏÖ¸¡µãÊýÔ×Ó²Ù×÷£º
#if __CUDA_ARCH__
< 600
__device__ double atomicAdd(double* address, double
val)
{
unsigned long long int* address_as_ull = (unsigned
long long int*)address; unsigned long long int
old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val
+ __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang
in case of NaN (since NaN != NaN)
}
while (assumed != old);
return __longlong_as_double(old);
}
#endif |
9.1.2 atomicSub()
int atomicSub(int*
address, int val);
unsigned int atomicSub(unsigned int* address,
unsigned int val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ×Öold£¬¼ÆËã(old - val)£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
9.1.3 atomicExch()
int atomicExch(int*
address, int val);
unsigned int atomicExch(unsigned int* address,
unsigned int val);
unsigned long long int atomicExch(unsigned long
long int* address, unsigned long long int val);
float atomicExch(float* address, float val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ»ò64λ×Öold£¬²¢½«val ´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÁ½Ïî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
9.1.4 atomicMin()
int atomicMin(int*
address, int val);
unsigned int atomicMin(unsigned int* address,
unsigned int val);
unsigned long long int atomicMin(unsigned long
long int* address, unsigned long long int val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ×Ö»ò64λ×Öold£¬¼ÆËãold ºÍval µÄ×îСֵ£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold>
×¢Ò⣺
64λµÄ²Ù×÷Ö»ÊÊÓÃÓÚ¼ÆËãÄÜÁ¦´óÓÚ3.5µÄGPU
9.1.5 atomicMax()
ͬatomicMin()¡£
9.1.6 atomicInc()
unsigned int atomicInc(unsigned
int* address, unsigned int val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ×Öold£¬¼ÆËã ((old >= val)
? 0 : (old+1))£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
9.1.7 atomicDec()
unsigned int
atomicDec(unsigned int* address, unsigned int
val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ×Öold£¬¼ÆËã (((old == 0) |
(old > val)) ? val : (old-1))£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
9.1.8 atomicCAS()
int atomicCAS(int*
address, int compare, int val);
unsigned int atomicCAS(unsigned int* address,
unsigned int compare, unsigned int val);
unsigned long long int atomicCAS(unsigned long
long int* address, unsigned long long int compare,
unsigned long long int val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ»ò64λ×Öold£¬¼ÆËã (old == compare
? val : old)£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold£¨±È½Ï²¢½»»»£©¡£
9.1.9 atomicAnd()
int atomicAnd(int*
address, int val);
unsigned int atomicAnd(unsigned int* address,
unsigned int val);
unsigned long long int atomicAnd(unsigned long
long int* address, unsigned long long int val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ×Ö»ò64λ×Öold£¬¼ÆËã (old &
val)£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
×¢Ò⣺
64λµÄ²Ù×÷Ö»ÊÊÓÃÓÚ¼ÆËãÄÜÁ¦´óÓÚ3.5µÄGPU
9.1.10 atomicOr()
int atomicOr(int*
address, int val);
unsigned int atomicOr(unsigned int* address, unsigned
int val);
unsigned long long int atomicOr(unsigned long
long int* address, unsigned long long int val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ×Ö»ò64λ×Öold£¬¼ÆËã (old | val)£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
×¢Ò⣺
64λµÄ²Ù×÷Ö»ÊÊÓÃÓÚ¼ÆËãÄÜÁ¦´óÓÚ3.5µÄGPU
9.1.11 atomicXor()
int atomicXor(int*
address, int val);
unsigned int atomicXor(unsigned int* address,
unsigned int val);
unsigned long long int atomicXor(unsigned long
long int* address, unsigned long long int val); |
¶ÁȡλÓÚÈ«¾Ö»ò¹²Ïí´æ´¢Æ÷ÖеØÖ·address´¦µÄ32λ×Ö»ò64λ×Öold£¬¼ÆËã (old ^ val)£¬²¢½«½á¹û´æ´¢ÔÚ´æ´¢Æ÷µÄͬһµØÖ·ÖС£ÕâÈýÏî²Ù×÷ÔÚÒ»´ÎÔ×ÓÊÂÎñÖÐÖ´ÐС£¸Ãº¯Êý½«·µ»Øold¡£
×¢Ò⣺
64λµÄ²Ù×÷Ö»ÊÊÓÃÓÚ¼ÆËãÄÜÁ¦´óÓÚ3.5µÄGPU
9.2 Ö±·½Í¼Í³¼Æ
ÔÚÒ»¿ªÊ¼ÎÒÃǾÍÌáµ½¹ý£¬Ô×Ó²Ù×÷ÊÇΪÁ˱£Ö¤Ã¿´ÎÖ»ÄÜÓÐÒ»¸öÏ̶߳ԱäÁ¿½øÐжÁд£¬¶øÆäËüÏ̱߳ØÐëµÈ´ý¡£ ÕâÑù¿ÉÒÔÓÐЧµØ±ÜÃâ¶à¸öÏ̷߳ÃÎʺÍÐÞ¸ÄÒ»¸ö±äÁ¿´øÀ´µÄ²»È·¶¨ÎÊÌâ¡£
ÏÂÃæÎÒÃǵÄÀý×Ó¾ÍÊǶÔÒ»¶Ñ·¶Î§ÔÚ[0 255]µÄÊý½øÐÐÖ±·½Í¼Í³¼Æ¡£ÓÉÓÚÿÓöµ½Ò»¸öÊýÎÒÃǾÍÒªÔÚ¶ÔӦͳ¼ÆÖµ´¦¼Ó1£¬Òò´Ë¶àÏ̲߳Ù×÷ÍùͬһλÖüÓ1µÄʱºòºÜÈÝÒ׳öÏÖÎÊÌâ¡£Òò´ËÎÒÃDzÉÓÃÔ×Ó²Ù×÷¡£
¹ØÓÚÏÂÃæµÄ¼òµ¥³ÌÐò»¹Óм¸µã˵Ã÷:
ΪÁËÑéÖ¤GPU½á¹ûµÄ׼ȷÐÔ£¬ÎÒÊ×ÏȲÉÓÃCPU½øÐÐÖ±·½Í¼Í³¼Æ£¬È»ºóÔÚGPUÖжÔÊý¾Ý½øÐÐ ¼õ¼ÆÊý£¬Èç¹û×îºóͳ¼Æ½á¹û¾ùΪ0£¬Ôò˵Ã÷GPUºÍCPUͳ¼Æ½á¹ûÒ»Ö¡£
³ÌÐòÖÐÿһ¸öblock°üº¬256¸öỊ̈߳¬È»ºóÎÒÔÚÿһ¸öblockÖпª±ÙÁËÒ»¿é¹²ÏíÄÚ´ætemp£¬²¢½«Ò»¸öblockÖеÄͳ¼Æ½á¹û´æ´¢µ½tempÉÏ£¬×îºóÔÚ¶ÔËùÓÐblockµÄ½á¹û½øÐÐÕûºÏ¡£ÕâЩ²Ù×÷Ö÷ÒªÊÇΪÁ˱ÜÃâÔÚÈ«¾ÖÄÚ´æÉϽøÐÐÔ×Ó²Ù×÷£¬·ñÔòËÙ¶È»á·Ç³£Âý¡£
¶ÔÓÚGPUµÄ¼ÆËãʱ¼äÎÒûÓп¼ÂÇÊý¾ÝÔÚÖ÷»úºÍÉ豸֮¼äµÄͨÐÅËù»¨·ÑµÄʱ¼ä¡£ ÊÂʵÉÏ£¬ÕâÀïµÄͨÐÅÊÇÒ»¼þÏà¶ÔºÜÂýµÄ¹¤×÷¡£¶ÁÕß¿ÉÒÔ×Ô¼º²âÁ¿ÏÂÕû¸ö°üº¬Í¨ÐÅËù»¨·ÑµÄʱ¼ä¡£
#include <iostream>
#include "cuda_runtime.h"
#include "time.h"
using namespace std;
#define num (256 * 1024 * 1024)
// ºËº¯Êý
// ×¢Ò⣬ΪÁË·½±ãÑéÖ¤GPUµÄͳ¼Æ½á¹û£¬ÕâÀï²ÉÓÃÁË"ÄæÖ±·½Í¼"£¬
// ¼´Ã¿·¢ÏÖÒ»¸öÊý×Ö£¬¾Í´ÓCPUµÄͳ¼Æ½á¹ûÖмõ1
__global__ void hist (unsigned char* inputdata,
int* outputhist, long size)
{
// ¿ª±Ù¹²ÏíÄڴ棬·ñÔòÔÚÈ«¾ÖÄÚ´æ²ÉÓÃÔ×Ó²Ù×÷»á·Ç³£Âý(ÒòΪ³åͻ̫¶à)
__shared__ int temp[256];
temp[threadIdx.x] = 0;
__syncthreads();
// ¼ÆËãÏß³ÌË÷Òý¼°Ïß³ÌÆ«ÒÆÁ¿
int ids = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
while (ids < size)
{
//²ÉÓÃÔ×Ó²Ù×÷¶ÔÒ»¸öblockÖеÄÊý¾Ý½øÐÐÖ±·½Í¼Í³¼Æ
atomicAdd(&temp[inputdata[ids]],1);
ids += offset;
}
// µÈ´ýͳ¼ÆÍê³É£¬¼õȥͳ¼Æ½á¹û
__syncthreads();
atomicSub (&outputhist[threadIdx.x], temp[threadIdx.x]);
}
int main()
{
// Éú³ÉËæ»úÊý¾Ý [0 255]
unsigned char* cpudata = new unsigned char[num];
for (size_t i = 0; i < num; i++)
cpudata[i] = s tatic_cast<unsigned char> (rand()
% 256);
// ÉùÃ÷Êý×éÓÃÓڼǼͳ¼Æ½á¹û
int cpuhist[256];
memset(cpuhist, 0, 256 * sizeof(int));
/******************************* CPU²âÊÔ´úÂë *********************************/
clock_t cpu_start, cpu_stop;
cpu_start = clock();
for (size_t i = 0; i < num; i++)
cpuhist[cpudata[i]] ++;
cpu_stop = clock();
cout << "CPU time: " <<
(cpu_stop - cpu_start) << "ms"
<< endl;
/******************************* GPU²âÊÔ´úÂë *********************************/
//¶¨ÒåʼþÓÃÓÚ¼ÆÊ±
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//¿ª±ÙÏÔ´æ²¢½«Êý¾Ýcopy½øÏÔ´æ
unsigned char* gpudata;
cudaMalloc((void**)& gpudata,num*sizeof(unsigned
char));
cudaMemcpy (gpudata, cpudata, num*sizeof(unsigned
char), cudaMemcpyHostToDevice);
// ¿ª±ÙÏÔ´æÓÃÓÚ´æ´¢Êä³öÊý¾Ý,²¢½«CPUµÄ¼ÆËã½á¹ûcopy½øÈ¥
int* gpuhist;
cudaMalloc((void**) &gpuhist, 256*sizeof(int));
cudaMemcpy (gpuhist, cpuhist, 256*sizeof(int),
cudaMemcpyHostToDevice);
// Ö´Ðк˺¯Êý²¢¼ÆÊ±
cudaEventRecord(start, 0);
hist << <1024, 256 >> > (gpudata,gpuhist,num);
cudaEventRecord(stop, 0);
// ½«½á¹ûcopy»ØÖ÷»ú
int histcpu[256];
cudaMemcpy (cpuhist,gpuhist,256*sizeof(int), cudaMemcpyDeviceToHost);
// Ïú»Ù¿ª±ÙµÄÄÚ´æ
cudaFree(gpudata);
cudaFree(gpuhist);
delete cpudata;
// ¼ÆËãGPU»¨·Ñʱ¼ä²¢Ïú»Ù¼ÆÊ±Ê¼þ
cudaEventSynchronize(stop);
float gputime;
cudaEventElapsedTime (&gputime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "GPU time: " <<
gputime << "ms" << endl;
// ÑéÖ¤½á¹û
long result = 0;
for (size_t i = 0; i < 256; i++)
result += cpuhist[i];
if (result == 0)
cout << " GPU has the same result
with CPU." << endl;
else
cout << " Error: GPU has a different
result with CPU." << endl;
system("pause");
return 0;
} |
ÉÏÃæµÄÖ´Ðнá¹ûΪ£º
CPU time: 188ms
GPU time: 26.7367ms
GPU has the same result with CPU. |
Ê®¡¢ Á÷²¢ÐÐ
ÎÒÃÇÇ°ÃæÑ§Ï°µÄCUDA²¢ÐгÌÐòÉè¼Æ£¬»ù±¾É϶¼ÊÇÔÚÒ»ÅúÊý¾ÝÉÏÀûÓôóÁ¿Ïß³ÌʵÏÖ²¢ÐС£ ³ý´ËÖ®Í⣬ NVIDIAϵÁÐGPU»¹Ö§³ÖÁíÍâÒ»ÖÖÀàÐ͵IJ¢ÐÐÐÔ
¡ª¡ª Á÷¡£
GPUÖеÄÁ÷²¢ÐÐÀàËÆÓÚCPUÉϵÄÈÎÎñ²¢ÐУ¬¼´Ã¿¸öÁ÷¶¼¿ÉÒÔ¿´×÷ÊÇÒ»¸ö¶ÀÁ¢µÄÈÎÎñ£¬Ã¿¸öÁ÷ÖеĴúÂë²Ù×÷˳ÐòÖ´ÐС£
ÏÂÃæ´ÓÁ÷²¢ÐеĻù´¡µ½Ê¹ÓÃÀ´ËµÃ÷¡£
10.1 Ò³Ëø¶¨ÄÚ´æ
Á÷²¢ÐеÄʹÓÃÐèÒªÓÐÓ²¼þÖ§³Ö£º¼´±ØÐëÊÇÖ§³ÖÉè±¸ÖØµþ¹¦ÄܵÄGPU¡£
ͨ¹ýÏÂÃæµÄ´úÂë²éѯÉ豸ÊÇ·ñÖ§³ÖÉè±¸ÖØµþ¹¦ÄÜ£º
cudaDeviceProp
mprop;
cudaGetDeviceProperties(&mprop,0);
if (!mprop.deviceOverlap)
{
cout << "Device not support overlaps,
so stream is invalid!" << endl;
} |
Ö»ÓÐÖ§³ÖÉè±¸ÖØµþ£¬GPUÔÚÖ´ÐÐÒ»¸öºËº¯ÊýµÄͬʱ£¬²Å¿ÉÒÔͬʱÔÚÉ豸ÓëÖ÷»úÖ®¼äÖ´Ðи´ÖƲÙ×÷¡£ µ±È»£¬ÕâÖÖ¸´ÖƲÙ×÷ÐèÒªÔÚÒ»ÖÖÌØÊâµÄÄÚ´æÉϲſÉÒÔ½øÐÐ
¡ª¡ª Ò³Ëø¶¨ÄÚ´æ¡£
Ò³Ëø¶¨Äڴ棺 ÐèÒªÓÉcudaHostAlloc()·ÖÅ䣬ÓÖ³ÆÎª¹Ì¶¨Äڴ棨Pinned Memory£©»òÕß²»¿É·ÖÒ³ÄÚ´æ¡£
²Ù×÷ϵͳ½«²»»á¶ÔÕâ¿éÄÚ´æ·ÖÒ³²¢½»»»µ½´ÅÅÌÉÏ£¬´Ó¶øÈ·±£Á˸ÃÄÚ´æÊ¼ÖÕפÁôÔÚÎïÀíÄÚ´æÖУ¬ÒòΪÕâ¿éÄڴ潫²»»á±»ÆÆ»µ»òÕßÖØÐ¶¨Î»¡£
ÓÉÓÚgpuÖªµÀÄÚ´æµÄÎïÀíµØÖ·£¬Òò´Ë¿ÉÒÔͨ¹ý¡°Ö±½ÓÄÚ´æ·ÃÎÊ£¨Direct Memory Access£¬DMA£©¡±
Ö±½ÓÔÚgpuºÍÖ÷»úÖ®¼ä¸´ÖÆÊý¾Ý¡£
¿É·ÖÒ³Äڴ棺 malloc()·ÖÅäµÄÄÚ´æÊDZê×¼µÄ¡¢¿É·ÖÒ³µÄ£¨Pagable£©Ö÷»úÄÚ´æ¡£ ¿É·ÖÒ³ÄÚ´æÃæÁÙ×ÅÖØ¶¨Î»µÄÎÊÌ⣬Òò´ËʹÓÿɷÖÒ³ÄÚ´æ½øÐи´ÖÆÊ±£¬¸´ÖÆ¿ÉÄÜÖ´ÐÐÁ½´Î²Ù×÷£º´Ó¿É·ÖÒ³ÄÚ´æ¸´ÖÆµ½Ò»¿é¡°ÁÙʱ¡±Ò³Ëø¶¨Äڴ棬Ȼºó´ÓÒ³Ëø¶¨ÄÚ´æ¸´ÖÆµ½GPU¡£
ËäÈ»ÔÚÒ³Ëø¶¨ÄÚ´æÉÏÖ´Ðи´ÖƲÙ×÷ЧÂʱȽϸߣ¬µ«ÏûºÄÎïÀíÄÚ´æ¸ü¶à¡£Òò´Ë£¬Í¨³£¶ÔcudaMemcpy()µ÷ÓõÄÔ´ÄÚ´æ»òÕßÄ¿±êÄÚ´æ²ÅʹÓ㬶øÇÒʹÓÃÍê±ÏÁ¢¼´ÊÍ·Å¡£
10.2 Á÷²¢ÐлúÖÆ
Á÷²¢ÐÐÊÇÖ¸ÎÒÃÇ¿ÉÒÔ´´½¨¶à¸öÁ÷À´Ö´Ðжà¸öÈÎÎñ£¬ µ«Ã¿¸öÁ÷¶¼ÊÇÒ»¸öÐèÒª°´ÕÕ˳ÐòÖ´ÐеIJÙ×÷¶ÓÁС£ ÄÇôÎÒÃÇÈçºÎʵÏÖ³ÌÐò¼ÓËÙ£¿
ÆäºËÐľÍÔÚÓÚ£¬ÔÚÒ³Ëø¶¨ÄÚ´æÉϵÄÊý¾Ý¸´ÖÆÊǶÀÁ¢Óں˺¯ÊýÖ´Ðе쬼´ÎÒÃÇ¿ÉÒÔÔÚÖ´Ðк˺¯ÊýµÄͬʱ½øÐÐÊý¾Ý¸´ÖÆ¡£
ÕâÀïµÄ¸´ÖÆÐèҪʹÓÃcudaMemcpyAsync()£¬Ò»¸öÒÔÒì²½Ö´Ðеĺ¯Êý¡£µ÷ÓÃcudaMemcpyAsync()ʱ£¬Ö»ÊÇ·ÅÖÃÒ»¸öÇëÇ󣬱íʾÔÚÁ÷ÖÐÖ´ÐÐÒ»´ÎÄÚ´æ¸´ÖÆ²Ù×÷¡£µ±º¯Êý·µ»ØÊ±£¬ÎÒÃÇÎÞ·¨È·±£¸´ÖƲÙ×÷ÒѾ½áÊø¡£ÎÒÃÇÄܹ»µÃµ½µÄ±£Ö¤ÊÇ£¬¸´ÖƲÙ×÷¿Ï¶¨»áµ±ÏÂÒ»¸ö±»·ÅÈëÁ÷ÖеIJÙ×÷֮ǰִÐС£(Ïà±È֮ϣ¬cudaMemcpy()ÊÇÒ»¸öͬ²½Ö´Ðк¯Êý¡£µ±º¯Êý·µ»ØÊ±£¬¸´ÖƲÙ×÷ÒÑÍê³É¡£)
ÒÔ¼ÆËã a + b = cΪÀý£¬¼ÙÈçÎÒÃÇ´´½¨ÁËÁ½¸öÁ÷£¬Ã¿¸öÁ÷¶¼Êǰ´Ë³ÐòÖ´ÐУº
¸´ÖÆa(Ö÷»úµ½GPU) ->
¸´ÖÆb(Ö÷»úµ½GPU) -> ºËº¯Êý¼ÆËã -> ¸´ÖÆc(GPUµ½Ö÷»ú) |

ÈçÉÏͼ£¬¸´ÖƲÙ×÷ºÍºËº¯ÊýÖ´ÐÐÊÇ·Ö¿ªµÄ£¬µ«ÓÉÓÚÿ¸öÁ÷ÄÚ²¿ÐèÒª°´Ë³ÐòÖ´ÐУ¬Òò´Ë¸´ÖÆcµÄ²Ù×÷ÐèÒªµÈ´ýºËº¯ÊýÖ´ÐÐÍê±Ï¡£
ÓÚÊÇ£¬Õû¸ö³ÌÐòÖ´ÐеÄʱ¼äÏßÈçÏÂͼ£º(¼ýÍ·±íʾÐèÒªµÈ´ý)

´ÓÉÏÃæµÄʱ¼äÏßÎÒÃÇ¿ÉÒÔÆô·¢Ê½µÄ˼¿¼Ï£ºÈçºÎµ÷Õûÿ¸öÁ÷µ±ÖеIJÙ×÷˳ÐòÀ´»ñµÃ×î´óµÄÊÕÒæ£¿ Ìá¸ßÖØµþÂÊ¡£
ÈçÏÂͼËùʾ£¬¼ÙÈç¸´ÖÆÒ»·ÝÊý¾ÝµÄʱ¼äºÍÖ´ÐÐÒ»´ÎºËº¯ÊýµÄʱ¼ä²î²»¶à£¬ÄÇôÎÒÃÇ¿ÉÒÔ²ÉÓý»²æÖ´ÐеIJßÂÔ£º

ÓÉÓÚÁ÷0µÄaºÍbÒѾ׼±¸Íê³É£¬Òò´Ëµ±¸´ÖÆÁ÷1µÄbʱ£¬¿ÉÒÔͬ²½Ö´ÐÐÁ÷0µÄºËº¯Êý¡£ ÕâÑùÕû¸öʱ¼äÏߣ¬Ïà½ÏÓÚ֮ǰµÄ²Ù×÷ºÜÃ÷ÏÔÉÙµôÁËÁ½¿é²Ù×÷¡£
10.3 Á÷²¢ÐÐʾÀý
ÓëÁ÷Ïà¹ØµÄ³£Óú¯ÊýÈçÏ£º
// ´´½¨ÓëÏú»Ù
cudaStream_t stream//¶¨ÒåÁ÷
cudaStreamCreate(cudaStream_t * s)//´´½¨Á÷
cudaStreamDestroy(cudaStream_t s)//Ïú»ÙÁ÷
//ͬ²½
cudaStreamSynchronize()//ͬ²½µ¥¸öÁ÷£ºµÈ´ý¸ÃÁ÷ÉϵÄÃüÁî¶¼Íê³É
cudaDeviceSynchronize()//ͬ²½ËùÓÐÁ÷£ºµÈ´ýÕû¸öÉ豸ÉÏÁ÷¶¼Íê³É
cudaStreamWaitEvent()//µÈ´ýij¸öʼþ½áÊøºóÖ´ÐиÃÁ÷ÉϵÄÃüÁî
cudaStreamQuery()//²éѯһ¸öÁ÷ÈÎÎñÊÇ·ñÍê³É
//»Øµ÷
cudaStreamAddCallback()//ÔÚÈκεã²åÈë»Øµ÷º¯Êý
//ÓÅÏȼ¶
cudaStreamCreateWithPriority()
cudaDeviceGetStreamPriorityRange() |
ÏÂÃæ¸ø³öÒ»¸ö2¸öÁ÷Ö´ÐÐa + b = cµÄʾÀý£¬ ÎÒÃǼÙÉèÊý¾ÝÁ¿·Ç³£´ó£¬ÐèÒª½«Êý¾Ý²ð·Ö£¬Ã¿´Î¼ÆËãÒ»²¿·Ö¡£
#include <iostream>
#include "cuda_runtime.h"
using namespace std;
#define N (1024*256) // ÿ´Î´¦ÀíµÄÊý¾ÝÁ¿
#define SIZE (N*20) //Êý¾Ý×ÜÁ¿
// ºËº¯Êý£¬a + b = c
__global__ void add(int* a, int* b, int* c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
c[i] = a[i] + b[i];
}
int main()
{
// »ñÈ¡0ºÅGPUµÄÊôÐÔ²¢ÅжÏÊÇ·ñÖ§³ÖÉè±¸ÖØµþ¹¦ÄÜ
cudaDeviceProp mprop;
cudaGetDeviceProperties(&mprop,0);
if (!mprop.deviceOverlap)
{
cout << "Device not support overlaps,
so stream is invalid!" << endl;
return 0;
}
// ´´½¨¼ÆÊ±Ê¼þ
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
float elapsedTime;
// ´´½¨Á÷
cudaStream_t stream0, stream1;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);
// ¿ª±ÙÖ÷»úÒ³Ëø¶¨Äڴ棬²¢Ëæ»ú³õʼ»¯Êý¾Ý
int *host_a, *host_b, *host_c;
cudaHostAlloc((void**)&host_a, SIZE*sizeof(int),
cudaHostAllocDefault);
cudaHostAlloc((void**)&host_b, SIZE*sizeof(int),
cudaHostAllocDefault);
cudaHostAlloc((void**)&host_c, SIZE*sizeof(int),
cudaHostAllocDefault);
for (size_t i = 0; i < SIZE; i++)
{
host_a[i] = rand();
host_b[i] = rand();
}
// ÉùÃ÷²¢¿ª±ÙÏà¹Ø±äÁ¿ÄÚ´æ
int *dev_a0, *dev_b0, *dev_c0; //ÓÃÓÚÁ÷0µÄÊý¾Ý
int *dev_a1, *dev_b1, *dev_c1; //ÓÃÓÚÁ÷1µÄÊý¾Ý
cudaMalloc((void**)&dev_a0,N*sizeof(int));
cudaMalloc((void**)&dev_b0, N*sizeof(int));
cudaMalloc((void**)&dev_c0, N*sizeof(int));
cudaMalloc((void**)&dev_a1, N*sizeof(int));
cudaMalloc((void**)&dev_b1, N*sizeof(int));
cudaMalloc((void**)&dev_c1, N*sizeof(int));
/************************ ºËÐļÆË㲿·Ö ***************************/
cudaEventRecord(start, 0);
for (size_t i = 0; i < SIZE; i += 2*N)
{
// ¸´ÖÆÁ÷0Êý¾Ýa
cudaMemcpyAsync(dev_a0, host_a + i, N*sizeof(int),
cudaMemcpyHostToDevice, stream0);
// ¸´ÖÆÁ÷1Êý¾Ýa
cudaMemcpyAsync(dev_a1, host_a + i+N, N*sizeof(int),
cudaMemcpyHostToDevice, stream1);
// ¸´ÖÆÁ÷0Êý¾Ýb
cudaMemcpyAsync(dev_b0, host_b + i, N*sizeof(int),
cudaMemcpyHostToDevice, stream0);
// ¸´ÖÆÁ÷1Êý¾Ýb
cudaMemcpyAsync(dev_b1, host_b + i+N, N*sizeof(int),
cudaMemcpyHostToDevice, stream1);
// Ö´ÐÐÁ÷0ºËº¯Êý
add << <N / 256, 256, 0, stream0 >>
>(dev_a0, dev_b0, dev_c0);
// Ö´ÐÐÁ÷1ºËº¯Êý
add << <N / 256, 256, 0, stream1 >>
>(dev_a1, dev_b1, dev_c1);
// ¸´ÖÆÁ÷0Êý¾Ýc
cudaMemcpyAsync(host_c + i*N, dev_c0, N*sizeof(int),
cudaMemcpyDeviceToHost, stream0);
// ¸´ÖÆÁ÷1Êý¾Ýc
cudaMemcpyAsync(host_c + i*N+N, dev_c1, N*sizeof(int),
cudaMemcpyDeviceToHost, stream1);
}
// Á÷ͬ²½
cudaStreamSynchronize(stream0);
cudaStreamSynchronize(stream1);
// ´¦Àí¼ÆÊ±
cudaEventSynchronize(stop);
cudaEventRecord(stop, 0);
cudaEventElapsedTime(&elapsedTime, start,
stop);
cout << "GPU time: " <<
elapsedTime << "ms" <<
endl;
// Ïú»ÙËùÓпª±ÙµÄÄÚ´æ
cudaFreeHost(host_a); cudaFreeHost(host_b);
cudaFreeHost(host_c);
cudaFree(dev_a0); cudaFree(dev_b0); cudaFree(dev_c0);
cudaFree(dev_a1); cudaFree(dev_b1); cudaFree(dev_c1);
// Ïú»ÙÁ÷ÒÔ¼°¼ÆÊ±Ê¼þ
cudaStreamDestroy(stream0); cudaStreamDestroy(stream1);
cudaEventDestroy(start); cudaEventDestroy(stop);
return 0;
} |
|