#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
// Number of float elements in each vector: sizes the host and device buffers
// and is the upper bound checked inside the vecAdd kernel.
#define N 100
6
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// A, B and C must be readable/writable from device code and hold at least N
// floats each. Expects a 1D launch and uses no shared memory.
//
// Each thread starts at its global index (block offset + thread offset) and
// advances by the total number of launched threads, so the result is correct
// for any grid/block size -- including launches with several blocks, where
// indexing by threadIdx.x alone would leave elements past blockDim.x unwritten.
__global__ void vecAdd(float* A,float* B,float* C){
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        C[i] = A[i] + B[i];
    }
}
12
13
14
15
// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the vecAdd kernel.
//
// Allocates three N-element float vectors on the host and on the device,
// fills A and B with random integers in [0, 99], adds them on the GPU, prints
// one "index:a+b=c" line per element, and releases all buffers.
//
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails. Command-line arguments are not used.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    size_t size = N * sizeof(float);

    // Host buffers.
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Device buffers.
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // Random inputs, seeded from the wall clock so each run differs.
    srand((unsigned)time(NULL));
    for (int i = 0; i < N; i++) {
        h_A[i] = (float)(rand() % 100);
        h_B[i] = (float)(rand() % 100);
    }

    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(A, host to device)");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(B, host to device)");

    // Ceil-divide so the grid covers all N elements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    // A kernel launch returns nothing; configuration errors show up here.
    checkCuda(cudaGetLastError(), "vecAdd launch");

    // Blocking copy: also waits for the kernel, so execution faults are
    // reported by this call.
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy(C, device to host)");

    // "%5d" rather than "%5.0d": a precision of 0 prints no digits at all for
    // the value 0, which left the first row's index blank.
    for (int i = 0; i < N; i++) {
        printf("%5d:%.0f+%.0f=%.0f\n", i, h_A[i], h_B[i], h_C[i]);
    }

    free(h_A);
    free(h_B);
    free(h_C);

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    return 0;
}
// Reposted from: https://www.cnblogs.com/zhangchengbing/p/5063278.html