天天看點

向量相加CUDA練習

#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
 5 #define N 100 /* Element count of each vector; sizes every host/device buffer and bounds the kernel. */
 6 
 7 __global__ void vecAdd(float* A,float* B,float* C){
 8     int i=threadIdx.x;
 9     if(i<N)
10         C[i]=A[i]+B[i];
11 }
12 
13 
14 
15 
/* Evaluate a CUDA runtime call once; on failure report where and why, then exit. */
#define CUDA_CHECK(call)                                                    \
    do{                                                                     \
        cudaError_t err_=(call);                                            \
        if(err_!=cudaSuccess){                                              \
            fprintf(stderr,"CUDA error %s:%d: %s\n",__FILE__,__LINE__,      \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }while(0)

/*
 * Host driver for the vector-addition exercise.
 *
 * Fills two N-element host vectors with random integers in [0, 99], copies
 * them to the device, launches vecAdd, copies the result back and prints one
 * "index:a+b=c" line per element.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main(int argc,char** argv){
    (void)argc;  /* command-line arguments are not used */
    (void)argv;

    size_t size=N*sizeof(float);

    /* Host buffers. */
    float* h_A=(float*)malloc(size);
    float* h_B=(float*)malloc(size);
    float* h_C=(float*)malloc(size);
    if(h_A==NULL||h_B==NULL||h_C==NULL){
        fprintf(stderr,"host allocation of %lu bytes failed\n",(unsigned long)size);
        free(h_A);  /* free(NULL) is a no-op, so partial success is handled */
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    /* Device buffers. */
    float* d_A=NULL;
    float* d_B=NULL;
    float* d_C=NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A,size));
    CUDA_CHECK(cudaMalloc((void**)&d_B,size));
    CUDA_CHECK(cudaMalloc((void**)&d_C,size));

    /* Random inputs; seeded from the clock, so each run differs. */
    srand((unsigned)time(NULL));
    for(int i=0;i<N;i++){
        h_A[i]=(float)(rand()%100);
        h_B[i]=(float)(rand()%100);
    }

    CUDA_CHECK(cudaMemcpy(d_A,h_A,size,cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B,h_B,size,cudaMemcpyHostToDevice));

    /* Ceil-divide so the grid covers N even when it is not a multiple of the block size. */
    int threadsPerBlock=256;
    int blocksPerGrid=(N+threadsPerBlock-1)/threadsPerBlock;
    vecAdd<<<blocksPerGrid,threadsPerBlock>>>(d_A,d_B,d_C);
    /* A launch returns no status itself; a bad configuration shows up here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy on the default stream: it is ordered after the kernel and
       also reports any error raised while the kernel was running. */
    CUDA_CHECK(cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost));

    for(int i=0;i<N;i++){
        /* "%5d", not "%5.0d": a precision of 0 prints no digits at all for
           the value 0, which left the index of the first row blank. */
        printf("%5d:%.0f+%.0f=%.0f\n",i,h_A[i],h_B[i],h_C[i]);
    }

    free(h_A);
    free(h_B);
    free(h_C);

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}

轉載于:https://www.cnblogs.com/zhangchengbing/p/5063278.html