4.4.1 Memory Bandwidth
Theoretical bandwidth is the absolute maximum bandwidth the hardware can deliver. In CUDA it can be derived from the device properties:
int dev = 0;
cudaSetDevice(dev);

cudaDeviceProp deviceprop;
CHECK(cudaGetDeviceProperties(&deviceprop, dev));
printf("device %d: %s \n", dev, deviceprop.name);

// memoryClockRate is in kHz and memoryBusWidth in bits; the factor 2 accounts
// for double data rate memory, /8 converts bits to bytes, /1.0e6 yields GB/s
float pbnd = 2.0 * deviceprop.memoryClockRate * (deviceprop.memoryBusWidth / 8) / 1.0e6;
printf("Peak Memory Bandwidth (GB/s): %f\n", pbnd);
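Effective bandwidth, by contrast, is computed from a kernel's actual data traffic and measured run time: (bytes read + bytes written) / elapsed time. The benchmark later in this section inlines exactly this expression; the helper below is only a sketch of the same calculation (the function name and the example timing are made up for illustration):

#include <stdio.h>

// Sketch only: effective bandwidth in GB/s for a kernel that reads and writes
// nx * ny floats once each, given the elapsed time in milliseconds.
float effectiveBandwidthGBs(int nx, int ny, float elapsedMs)
{
    double bytes = 2.0 * nx * ny * sizeof(float);  // one read + one write per element
    return (float)(bytes / 1.0e6 / elapsedMs);     // bytes / 1e6 / ms  ==  GB/s
}

int main(void)
{
    // example: a 2048 x 2048 transpose that took 0.05 ms (made-up timing)
    printf("%.1f GB/s\n", effectiveBandwidthGBs(1 << 11, 1 << 11, 0.05f));
    return 0;
}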
4.4.2 The Matrix Transpose Problem
Matrix transpose is a basic operation in linear algebra; basic as it is, it shows up in many applications.
Looking at the input and output layouts, you will notice:
· Reads: accessed along the rows of the original matrix, resulting in coalesced access.
· Writes: accessed along the columns of the transposed matrix, resulting in strided access.
Strided access is the worst memory access pattern for GPU performance, but in a matrix transpose it cannot be avoided. For example, for a row-major nx × ny input, thread (ix, iy) reads in[iy*nx + ix], so consecutive threads touch consecutive addresses, while it writes out[ix*ny + iy], so consecutive threads are ny elements apart.
CUDA implementation:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include "../common/common.h"

// host-side reference transpose (loop order: output rows)
void tranposeHost_test(float* out, float* in, const int nx, const int ny)
{
    for (int i = 0; i < nx; i++)
    {
        for (int j = 0; j < ny; j++)
        {
            out[i * ny + j] = in[j * nx + i];
        }
    }
}

// host-side reference transpose (loop order: input rows)
void tranposeHost(float* out, float* in, const int nx, const int ny)
{
    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            out[ix * ny + iy] = in[iy * nx + ix];
        }
    }
}
// warmup kernel: a simple copy used to absorb startup overhead before timing
__global__ void warmup(float* out, float* in, const int nx, const int ny)
{
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny)
    {
        out[iy * nx + ix] = in[iy * nx + ix];
    }
}

// copy kernel: load by row, store by row (both accesses coalesced)
__global__ void copyRow(float* out, float* in, const int nx, const int ny)
{
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny)
    {
        out[iy * nx + ix] = in[iy * nx + ix];
    }
}

// copy kernel: load by column, store by column (both accesses strided)
__global__ void copyCol(float* out, float* in, const int nx, const int ny)
{
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny)
    {
        out[ix * ny + iy] = in[ix * ny + iy];
    }
}

// transpose: load by row (coalesced), store by column (strided)
__global__ void tranposeNaiveRow(float* out, float* in, const int nx, const int ny)
{
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny)
    {
        out[ix * ny + iy] = in[iy * nx + ix];
    }
}

// transpose: load by column (strided), store by row (coalesced)
__global__ void tranposeNaiveCol(float* out, float* in, const int nx, const int ny)
{
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny)
    {
        out[iy * nx + ix] = in[ix * ny + iy];
    }
}
// fill an nx x ny matrix with small random values
void initialData(float *ip, int nx, int ny)
{
    time_t t;
    srand((unsigned int) time(&t));
    for (int i = 0; i < nx; i++)
    {
        for (int j = 0; j < ny; j++)
        {
            ip[i + j * nx] = (float)(rand() & 0xff) / 10.0f;
        }
    }
}

// compare host and device results element by element
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)   // fabs, not abs: the operands are floats
        {
            match = 0;
            printf("Arrays do not match\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match\n");
}
int main(int argc, char** argv)
{
    // set up the device and report its theoretical peak bandwidth
    int dev = 0;
    cudaSetDevice(dev);
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("device %d: %s \n", dev, deviceprop.name);
    float pbnd = 2.0 * deviceprop.memoryClockRate * (deviceprop.memoryBusWidth / 8) / 1.0e6;
    printf("Peak Memory Bandwidth (GB/s): %f\n", pbnd);

    // problem size and launch configuration, overridable from the command line
    int nx = 1 << 11;
    int ny = 1 << 11;
    int iKernel = 0;
    int blockx = 16;
    int blocky = 16;
    if (argc > 1) iKernel = atoi(argv[1]);
    if (argc > 2) blockx  = atoi(argv[2]);
    if (argc > 3) blocky  = atoi(argv[3]);
    if (argc > 4) nx      = atoi(argv[4]);
    if (argc > 5) ny      = atoi(argv[5]);
    printf("matrix nx %d ny %d with kernel %d\n", nx, ny, iKernel);

    size_t nBytes = nx * ny * sizeof(float);
    dim3 block(blockx, blocky);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // allocate host memory and build the CPU reference result
    float *h_A     = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);
    initialData(h_A, nx, ny);
    tranposeHost(hostRef, h_A, nx, ny);   // fill hostRef so checkResult has a valid reference

    // allocate device memory and copy the input matrix over
    float *d_A, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // warmup launch; note the argument order is (out, in)
    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_C, d_A, nx, ny);
    cudaDeviceSynchronize();
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    // select the kernel to benchmark
    void (*kernel)(float*, float*, int, int);
    const char *kernelName;
    switch (iKernel)
    {
    case 0:
        kernel = &copyRow;
        kernelName = "COPYROW";
        break;
    case 1:
        kernel = &copyCol;
        kernelName = "COPYCOL";
        break;
    case 2:
        kernel = &tranposeNaiveRow;
        kernelName = "tranposeNaiveRow";
        break;
    case 3:
        kernel = &tranposeNaiveCol;
        kernelName = "tranposeNaiveCol";
        break;
    }

    // time the selected kernel and report its effective bandwidth
    timer.start();
    kernel<<<grid, block>>>(d_C, d_A, nx, ny);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime = timer.elapsedms();
    float ibnd = 2 * nx * ny * sizeof(float) / 1e6 / elapsedTime;   // (read + write) bytes / 1e6 / ms == GB/s
    printf("%s elapsed %f ms <<<grid(%d, %d) block (%d, %d)>>> effective bandwidth %f GB/s, ratio %f%% \n",
           kernelName, elapsedTime, grid.x, grid.y, block.x, block.y, ibnd, ibnd / pbnd * 100);

    // only the transpose kernels (iKernel > 1) produce a result to verify
    if (iKernel > 1)
    {
        CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
        checkResult(hostRef, gpuRef, nx * ny);
    }

    cudaFree(d_A);
    cudaFree(d_C);
    free(h_A);
    free(hostRef);
    free(gpuRef);
    cudaDeviceReset();
    return 0;
}
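The code above relies on CHECK and Timer from ../common/common.h, which this section does not show. A minimal stand-in for that header could look like the sketch below (an assumption for completeness, not the author's actual common.h):

// hypothetical stand-in for ../common/common.h -- the real header is not shown
#pragma once
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <chrono>

// abort with a message when a CUDA runtime call returns an error
#define CHECK(call)                                                  \
{                                                                    \
    const cudaError_t err = (call);                                  \
    if (err != cudaSuccess)                                          \
    {                                                                \
        printf("Error: %s:%d, %s\n", __FILE__, __LINE__,             \
               cudaGetErrorString(err));                             \
        exit(1);                                                     \
    }                                                                \
}

// simple host-side wall-clock timer; the original Timer may be implemented differently
struct Timer
{
    std::chrono::high_resolution_clock::time_point t0, t1;
    void start() { t0 = std::chrono::high_resolution_clock::now(); }
    void stop()  { t1 = std::chrono::high_resolution_clock::now(); }
    float elapsedms()
    {
        return std::chrono::duration<float, std::milli>(t1 - t0).count();
    }
};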
There are four benchmark kernels in total. Running them one by one:
transpose.exe 0
COPYROW elapsed 0.049184 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 682.222534 GB/s, ratio 67.674362%
transpose.exe 1
COPYCOL elapsed 0.060224 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 557.160461 GB/s, ratio 55.268593%
transpose.exe 2
tranposeNaiveRow elapsed 0.098144 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 341.889801 GB/s, ratio 33.914410%
transpose.exe 3
tranposeNaiveCol elapsed 0.048128 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 697.191467 GB/s, ratio 69.159233%
The kernels:
copyRow – copies a matrix, loading by row and storing by row; 67.674362% of peak bandwidth
copyCol – copies a matrix, loading by column and storing by column; 55.268593%
tranposeNaiveRow – transposes a matrix, loading by row and storing by column; 33.914410%
tranposeNaiveCol – transposes a matrix, loading by column and storing by row; 69.159233%
The results show that the NaiveCol approach outperforms the NaiveRow approach. One possible reason for this gain is that the strided reads can be serviced from the cache, which softens their penalty, while the writes remain coalesced.
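One way to probe this explanation further (assuming the nvprof profiler is available for your device; on newer GPUs the equivalent metrics are reported by Nsight Compute) is to compare the global memory load/store efficiency of the two naive kernels:
nvprof --metrics gld_efficiency,gst_efficiency transpose.exe 2
nvprof --metrics gld_efficiency,gst_efficiency transpose.exe 3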
4.4.3 Unrolled Transpose
The goal of unrolling is to assign more independent work to each thread, thereby maximizing the number of memory requests in flight.
Add the following kernels:
// unrolled by 4: load by row, store by column; each block now covers
// 4 * blockDim.x columns, so ix must include the factor of 4
__global__ void tranposeUnroll4Row(float* out, float* in, const int nx, const int ny)
{
    unsigned int ix = blockDim.x * blockIdx.x * 4 + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int ti = iy * nx + ix;   // access in rows
    unsigned int to = ix * ny + iy;   // access in cols
    if (ix + 3 * blockDim.x < nx && iy < ny)
    {
        out[to]                       = in[ti];
        out[to + ny * blockDim.x]     = in[ti + blockDim.x];
        out[to + ny * 2 * blockDim.x] = in[ti + 2 * blockDim.x];
        out[to + ny * 3 * blockDim.x] = in[ti + 3 * blockDim.x];
    }
}

// unrolled by 4: load by column, store by row
__global__ void tranposeUnroll4Col(float* out, float* in, const int nx, const int ny)
{
    unsigned int ix = blockDim.x * blockIdx.x * 4 + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int ti = ix * ny + iy;   // access in cols
    unsigned int to = iy * nx + ix;   // access in rows
    if (ix + 3 * blockDim.x < nx && iy < ny)
    {
        out[to]                  = in[ti];
        out[to + blockDim.x]     = in[ti + blockDim.x * ny];
        out[to + blockDim.x * 2] = in[ti + blockDim.x * ny * 2];
        out[to + blockDim.x * 3] = in[ti + blockDim.x * ny * 3];
    }
}
Add the following cases to the kernel switch statement; note that grid.x must be adjusted, because each block now handles four times as many columns:
case 4:
    kernel = &tranposeUnroll4Row;
    kernelName = "tranposeUnroll4Row";
    grid.x = (nx + block.x * 4 - 1) / (block.x * 4);
    break;
case 5:
    kernel = &tranposeUnroll4Col;
    kernelName = "tranposeUnroll4Col";
    grid.x = (nx + block.x * 4 - 1) / (block.x * 4);
    break;
transpose.exe 4
tranposeUnroll4Row elapsed 0.149536 ms <<<grid(32, 128) block (16, 16)>>> effective bandwidth 224.390335 GB/s, ratio 22.258825%
transpose.exe 5
tranposeUnroll4Col elapsed 0.049824 ms <<<grid(32, 128) block (16, 16)>>> effective bandwidth 673.459229 GB/s, ratio 66.805069%
4.4.4 Diagonal Transpose
When a grid of thread blocks is launched, the blocks are distributed to SMs. The programming model may express the grid with a one- or two-dimensional layout, but from the hardware's point of view all blocks are one-dimensional. A two-dimensional block ID can be flattened as:
int bid = blockIdx.y * gridDim.x + blockIdx.x;
When a kernel is launched, the order in which thread blocks are assigned to SMs is determined by this block ID. Once all SMs are fully occupied, the remaining blocks are held back until the currently executing ones complete; as soon as a block finishes, another block is assigned to that SM. Because blocks complete at unpredictable rates and in unpredictable order, the set of active blocks, initially contiguous in bid, becomes less and less contiguous as the kernel progresses. Although you cannot directly control the order in which blocks are scheduled, you are free to reinterpret the block coordinates blockIdx.x and blockIdx.y.
(Figure: block coordinates of the grid in the Cartesian coordinate system.)
(Figure: block coordinates of the grid in the diagonal coordinate system.)
The diagonal coordinate system is used to determine the one-dimensional block ID, but data accesses still use Cartesian coordinates. Therefore, when block IDs are expressed in diagonal coordinates, they must be mapped back to Cartesian coordinates so that each block addresses the correct tile of data:
block_x = (blockIdx.x + blockIdx.y) % gridDim.x;
block_y = blockIdx.x;   // as in the figure above: the Cartesian row of a block is given by its diagonal x coordinate
Here, blockIdx.x and blockIdx.y are the diagonal coordinates, and block_x and block_y are the corresponding Cartesian coordinates.
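As a quick sanity check of this mapping, the small host-only sketch below (illustrative only, not part of the benchmark; the 4x4 grid size is an arbitrary assumption) prints the diagonal-to-Cartesian correspondence. Note how consecutive block IDs walk along diagonals of the Cartesian grid rather than along a single row.

#include <stdio.h>

int main(void)
{
    const int gridDimX = 4, gridDimY = 4;        // assumed small grid, for illustration only
    for (int by = 0; by < gridDimY; by++)        // by plays the role of blockIdx.y (diagonal)
    {
        for (int bx = 0; bx < gridDimX; bx++)    // bx plays the role of blockIdx.x (diagonal)
        {
            int block_x = (bx + by) % gridDimX;  // Cartesian column of the block
            int block_y = bx;                    // Cartesian row of the block
            printf("diag(%d,%d) -> cart(%d,%d)  ", bx, by, block_x, block_y);
        }
        printf("\n");
    }
    return 0;
}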
The following kernels use diagonal block coordinates; the Row variant performs the transpose with coalesced reads and strided writes, and the Col variant does the opposite:
// diagonal block ordering: load by row (coalesced), store by column (strided)
__global__ void tranposediagonalRow(float* out, float* in, const int nx, const int ny)
{
    // remap the diagonal block coordinates back to Cartesian coordinates
    unsigned int blk_y = blockIdx.x;
    unsigned int blk_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    unsigned int ix = blockDim.x * blk_x + threadIdx.x;
    unsigned int iy = blockDim.y * blk_y + threadIdx.y;
    if (ix < nx && iy < ny)
    {
        out[ix * ny + iy] = in[iy * nx + ix];
    }
}

// diagonal block ordering: load by column (strided), store by row (coalesced)
__global__ void tranposediagonalCol(float* out, float* in, const int nx, const int ny)
{
    unsigned int blk_y = blockIdx.x;
    unsigned int blk_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    unsigned int ix = blockDim.x * blk_x + threadIdx.x;
    unsigned int iy = blockDim.y * blk_y + threadIdx.y;
    if (ix < nx && iy < ny)
    {
        out[iy * nx + ix] = in[ix * ny + iy];
    }
}
Add the following cases to the kernel switch statement to invoke these kernels:
case 6:
    kernel = &tranposediagonalRow;
    kernelName = "tranposediagonalRow";
    break;
case 7:
    kernel = &tranposediagonalCol;
    kernelName = "tranposediagonalCol";
    break;
transpose.exe 6
tranposediagonalRow elapsed 0.094048 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 356.779846 GB/s, ratio 35.391457%
transpose.exe 7
tranposediagonalCol elapsed 0.045056 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 744.727295 GB/s, ratio 73.874641%
4.4.5 Using Thin Blocks to Increase Parallelism
The simplest way to increase parallelism is to adjust the block size, and the previous sections have already shown how effective this simple technique can be. Here we experiment further with the block size of the column-based NaiveCol kernel (selected through case 3 of the kernel switch statement).
The initial block size is (16, 16).
The block size for case 3 is changed on the command line, for example transpose.exe 3 32 32, and so on.
block.x = 8
tranposeNaiveCol elapsed 0.041216 ms <<<grid(256, 64) block (8, 32)>>> effective bandwidth 814.111755 GB/s, ratio 80.757362%
tranposeNaiveCol elapsed 0.049152 ms <<<grid(256, 128) block (8, 16)>>> effective bandwidth 682.666626 GB/s, ratio 67.718414%
tranposeNaiveCol elapsed 0.064384 ms <<<grid(256, 256) block (8, 8)>>> effective bandwidth 521.161072 GB/s, ratio 51.697563%
block.x = 16
tranposeNaiveCol elapsed 0.056288 ms <<<grid(128, 64) block (16, 32)>>> effective bandwidth 596.120544 GB/s, ratio 59.133308%
tranposeNaiveCol elapsed 0.090080 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 372.495911 GB/s, ratio 36.950439%
tranposeNaiveCol elapsed 0.061216 ms <<<grid(128, 256) block (16, 8)>>> effective bandwidth 548.131714 GB/s, ratio 54.372967%
block.x = 32
tranposeNaiveCol elapsed 0.066752 ms <<<grid(64, 64) block (32, 32)>>> effective bandwidth 502.673035 GB/s, ratio 49.863605%
tranposeNaiveCol elapsed 0.047936 ms <<<grid(64, 128) block (32, 16)>>> effective bandwidth 699.984009 GB/s, ratio 69.436249%
tranposeNaiveCol elapsed 0.062560 ms <<<grid(64, 256) block (32, 8)>>> effective bandwidth 536.356018 GB/s, ratio 53.204853%
The numbers above vary from run to run even with identical settings, but the (8, 32) block performs well. The rationale is illustrated in the figure below:
(Figure: by increasing the number of consecutive elements stored by a thread block, a "thin" block improves the efficiency of the store operation.)