編寫代碼
首先將上次轉灰階圖的程式拷過來,用於生成灰階圖
共編寫了cpu、gpu_wrong_naive、gpu_naive、gpu_usesharemem四種方式實作
cpu版本
cpu版本代碼很簡單:
// CPU reference implementation: builds a 256-bin histogram of an 8-bit
// grayscale image. grayData is the row-major pixel buffer (imgheight x
// imgwidth); hist must point to 256 counters, zero-initialized by the caller.
void getGrayHistincpu(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth)
{
for(int i = 0; i < imgheight; i++)
{
for (int j = 0; j < imgwidth; j++)
{
// Each pixel's gray value (0..255) indexes its bin directly.
hist[grayData[i*imgwidth+j]]++;
}
}
}
gpu版本1——直接照搬 gpu_wrong_naive
// GPU version 1 (deliberately WRONG, kept for demonstration): one thread per
// pixel, but the bin update below is a plain read-modify-write on global
// memory, so threads hitting the same bin race and the totals vary per run.
__global__ void getGrayHistincuda_wrong_naive(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth) //races on hist: the result changes from run to run
{
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;
if(idx < imgwidth && idy < imgheight)
{
const unsigned long pid = imgwidth * idy + idx;
const unsigned char value = grayData[pid];
// BUG (intentional): non-atomic increment of shared global counter.
hist[value]++;
}
}
這個代碼有問題:因為各個線程會同時讀寫同一塊全局記憶體(沒有任何同步),數值會不正確
gpu版本2——原子操作 gpu_naive
// GPU version 2: one thread per pixel; atomicAdd serializes conflicting
// updates to the same bin, so the result is correct, at the cost of heavy
// contention on the 256 global-memory counters.
__global__ void getGrayHistincuda_naive(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth) //atomic add keeps the counts correct
{
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;
if(idx < imgwidth && idy < imgheight)
{
const unsigned long pid = imgwidth * idy + idx;
const unsigned char value = grayData[pid];
atomicAdd(&(hist[value]), 1);
}
}
這個代碼結果是正確的,但是因為有很多線程(我這裡設定的是一個像素交由一個線程處理)互相競爭全局記憶體,速度並不快
gpu版本3——共享記憶體 gpu_usesharemem
// GPU version 3: each block accumulates a private 256-bin histogram in shared
// memory, then merges it into the global histogram with one atomicAdd per bin
// per block — far fewer global-memory collisions than version 2.
__global__ void getGrayHistincuda_usesharemem(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth) //use shared memory to reduce global atomics
{
__shared__ unsigned int hist_shared[256]; //block-private histogram (shared memory is per-block)
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;
// Flat in-block thread index. NOTE(review): declared unsigned char, so it
// wraps modulo 256; the zeroing and the merge below only cover all 256 bins
// when the block is 32x32 (1024 threads) as launched in main — confirm
// before reusing with other block shapes.
const unsigned char inner_idx = threadIdx.y * blockDim.x + threadIdx.x;
hist_shared[inner_idx%256] = 0; //cooperatively zero the bins; with >=256 threads every bin is cleared
__syncthreads(); //all bins must be zero before counting starts
if(idx < imgwidth && idy < imgheight)
{
const unsigned long pid = imgwidth * idy + idx;
const unsigned char value = grayData[pid];
atomicAdd(&(hist_shared[value]), 1);
}
__syncthreads();
// With blockDim.x == 32, threads with threadIdx.y < 8 have inner_idx 0..255:
// exactly one thread per bin merges into the global histogram.
if(threadIdx.y < 8)
{
atomicAdd(&(hist[inner_idx]), hist_shared[inner_idx]);
}
}
這個程式比上一個速度更快,加速的原因有二:
- 使用了更快的共享記憶體
- 共享記憶體由線程塊獨占,是以各個線程塊在寫入共享記憶體時,不會與其他線程塊沖突;另外,在合并共享記憶體時,也減少了沖突
測試
正確性
以cpu代碼做參考,只有gpu_wrong_naive是錯的,原因是沒有使用原子操作進行加法
速度
方法 | 時間 |
---|---|
cpu | 0.00069200 |
gpu_wrong_naive | 0.00013200 |
gpu_naive | 0.00021600 |
gpu_usesharemem | 0.00011300 |
可以看到,使用共享記憶體確實可以加速
源碼
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <time.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
Mat rgb2gray(Mat& srcImage);
// CPU reference implementation: builds a 256-bin histogram of an 8-bit
// grayscale image. grayData is the row-major pixel buffer (imgheight x
// imgwidth); hist must point to 256 counters, zero-initialized by the caller.
void getGrayHistincpu(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth)
{
    // Row-major layout lets us treat the image as one flat pixel array.
    const size_t pixelCount = (size_t)imgheight * (size_t)imgwidth;
    for (size_t p = 0; p < pixelCount; ++p)
    {
        // The gray value (0..255) indexes its bin directly.
        ++hist[grayData[p]];
    }
}
// GPU version 1 (deliberately WRONG, kept for demonstration): one thread per
// pixel, but the bin update below is a plain read-modify-write on global
// memory, so threads hitting the same bin race and the totals vary per run.
__global__ void getGrayHistincuda_wrong_naive(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth) //races on hist: the result changes from run to run
{
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;
if(idx < imgwidth && idy < imgheight)
{
const unsigned long pid = imgwidth * idy + idx;
const unsigned char value = grayData[pid];
// BUG (intentional): non-atomic increment of shared global counter.
hist[value]++;
}
}
// GPU version 2: one thread per pixel; atomicAdd serializes conflicting
// updates to the same bin, so the result is correct, at the cost of heavy
// contention on the 256 global-memory counters.
// Expects a 2D launch covering at least imgwidth x imgheight threads.
__global__ void getGrayHistincuda_naive(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth) //atomic add keeps the counts correct
{
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
    // Guard clause: threads in partial edge blocks fall outside the image.
    if (col >= imgwidth || row >= imgheight)
        return;
    const unsigned long pixel = imgwidth * row + col;
    // Atomic increment of the bin selected by this pixel's gray value.
    atomicAdd(&(hist[grayData[pixel]]), 1);
}
// GPU version 3: each block accumulates a private 256-bin histogram in shared
// memory, then merges it into the global histogram with one atomicAdd per bin
// per block — far fewer global-memory collisions than version 2.
//
// Fixed vs. the original: inner_idx was declared unsigned char, silently
// wrapping modulo 256, and the zero/merge steps assumed a 32x32 block
// (the merge condition threadIdx.y < 8 only enumerated bins 0..255 when
// blockDim.x == 32). The stride loops below are correct for any block size.
__global__ void getGrayHistincuda_usesharemem(unsigned char * const grayData,
unsigned int * const hist,
uint imgheight,
uint imgwidth) //use shared memory to reduce global atomics
{
    __shared__ unsigned int hist_shared[256]; //block-private histogram (shared memory is per-block)

    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;
    // Flat in-block thread index, wide enough for up to 1024 threads.
    const unsigned int inner_idx = threadIdx.y * blockDim.x + threadIdx.x;
    const unsigned int blockThreads = blockDim.x * blockDim.y;

    // Cooperatively zero all 256 bins, whatever the block size.
    for (unsigned int bin = inner_idx; bin < 256; bin += blockThreads)
        hist_shared[bin] = 0;
    __syncthreads(); //all bins must be zero before counting starts

    if(idx < imgwidth && idy < imgheight)
    {
        const unsigned long pid = imgwidth * idy + idx;
        const unsigned char value = grayData[pid];
        atomicAdd(&(hist_shared[value]), 1);
    }
    __syncthreads(); //block-private histogram complete

    // Merge this block's private histogram into the global one, one atomic
    // per bin per block.
    for (unsigned int bin = inner_idx; bin < 256; bin += blockThreads)
        atomicAdd(&(hist[bin]), hist_shared[bin]);
}
#define PHASE 3

// Abort with a message when a CUDA runtime call fails; kernel launches are
// additionally checked via cudaGetLastError() + cudaDeviceSynchronize().
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Driver: loads ./test.jpg, converts it to grayscale (rgb2gray is defined
// elsewhere in this project), then times one of the four histogram
// implementations selected at compile time via PHASE:
//   0 = cpu, 1 = gpu_wrong_naive, 2 = gpu_naive, 3 = gpu_usesharemem.
// Fixed vs. the original: the kernel-launch syntax was garbled ("<< <...>> >"
// does not compile), the PHASE 0 and PHASE 3 timing labels were wrong, and no
// CUDA call was error-checked.
int main(void)
{
    Mat srcImage = imread("./test.jpg");
    if (srcImage.empty()) // fail early instead of crashing inside rgb2gray
    {
        fprintf(stderr, "cannot load ./test.jpg\n");
        return EXIT_FAILURE;
    }
    Mat grayImage = rgb2gray(srcImage);
    const uint imgheight = grayImage.rows;
    const uint imgwidth = grayImage.cols;

    // One thread per pixel; ceil-divide so partial edge blocks are covered.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1)/threadsPerBlock.x,
                       (imgheight + threadsPerBlock.y - 1)/threadsPerBlock.y);

    unsigned char *gpuGrayData;
    unsigned int *gpuGrayHist;
    unsigned int grayHist[256] = {0};

    CUDA_CHECK(cudaMalloc((void**)&gpuGrayData, imgwidth*imgheight*sizeof(unsigned char)));
    CUDA_CHECK(cudaMalloc((void**)&gpuGrayHist, 256*sizeof(unsigned int)));
    CUDA_CHECK(cudaMemcpy(gpuGrayData, grayImage.data,
                          imgwidth*imgheight*sizeof(unsigned char),
                          cudaMemcpyHostToDevice));

    clock_t start, end;
#if PHASE == 0
    start = clock();
    getGrayHistincpu(grayImage.data, grayHist, imgheight, imgwidth);
    end = clock();
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    // Label fixed: the original printed "cuda_wrong_naive" for the CPU run.
    printf("cpu run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);
#elif PHASE == 1
    memset(grayHist, 0, 256*sizeof(unsigned int));
    CUDA_CHECK(cudaMemcpy(gpuGrayHist, grayHist, 256*sizeof(unsigned int),
                          cudaMemcpyHostToDevice)); // zero the device histogram
    start = clock();
    getGrayHistincuda_wrong_naive<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist,
                                                                      imgheight, imgwidth);
    CUDA_CHECK(cudaGetLastError());      // catch launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize()); // wait so the timing is meaningful
    end = clock();
    CUDA_CHECK(cudaMemcpy(grayHist, gpuGrayHist, 256*sizeof(unsigned int),
                          cudaMemcpyDeviceToHost));
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    printf("cuda_wrong_naive run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);
#elif PHASE == 2
    memset(grayHist, 0, 256*sizeof(unsigned int));
    CUDA_CHECK(cudaMemcpy(gpuGrayHist, grayHist, 256*sizeof(unsigned int),
                          cudaMemcpyHostToDevice)); // zero the device histogram
    start = clock();
    getGrayHistincuda_naive<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist,
                                                                imgheight, imgwidth);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    end = clock();
    CUDA_CHECK(cudaMemcpy(grayHist, gpuGrayHist, 256*sizeof(unsigned int),
                          cudaMemcpyDeviceToHost));
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    printf("cuda_naive run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);
#elif PHASE == 3
    memset(grayHist, 0, 256*sizeof(unsigned int));
    CUDA_CHECK(cudaMemcpy(gpuGrayHist, grayHist, 256*sizeof(unsigned int),
                          cudaMemcpyHostToDevice)); // zero the device histogram
    start = clock();
    getGrayHistincuda_usesharemem<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist,
                                                                      imgheight, imgwidth);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    end = clock();
    CUDA_CHECK(cudaMemcpy(grayHist, gpuGrayHist, 256*sizeof(unsigned int),
                          cudaMemcpyDeviceToHost));
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    // Label fixed: the original printed "cuda_naive" for the shared-memory run.
    printf("cuda_usesharemem run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);
#endif
    CUDA_CHECK(cudaFree(gpuGrayData));
    CUDA_CHECK(cudaFree(gpuGrayHist));
    return 0;
}