# The third chapter

Let's do an experiment.

It is also the addition of two two-dimensional matrices, which is performed on CUDA.

The code is as follows

```// This code can only run under Windows. Please modify the timing module on Linux
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <device_launch_parameters.h>
#include <Windows.h>
#define abs(a) a>0 ? a : -a
#define CHECK(call) \
{ \
const cudaError_t error=call; \
if(error!=cudaSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n",error,cudaGetErrorString(error)); \
exit(-10*error); \
} \
}
/*
* This example implements matrix element-wise addition on the host and GPU.
* sumMatrixOnHost iterates over the rows and columns of each matrix, adding
* elements from A and B together and storing the results in C. The current
* offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D
* implements the same logic, but using CUDA threads to process each matrix.
*/

void initialData(float *ip, const int size)
{
int i;

for(i = 0; i < size; i++)
{
ip[i] = (float)( rand() & 0xFF ) / 10.0f;
}
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
float *ia = A;
float *ib = B;
float *ic = C;

for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
ic[ix] = ia[ix] + ib[ix];
}

ia += nx;
ib += nx;
ic += nx;
}

return;
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
double epsilon = 1.0E-8;

for (int i = 0; i < N; i++)
{
if (abs(hostRef[i] - gpuRef[i]) > epsilon)
{
printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
printf("Arrays do not match.\n\n");
break;
}
}
}

// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY)
{
unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int idx = iy * NX + ix;

if (ix < NX && iy < NY)
{
C[idx] = A[idx] + B[idx];
}
}

int main(int argc, char **argv)
{
// set up device
int dev = 0;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
CHECK(cudaSetDevice(dev));

// set up data size of matrix
int nx = 1 << 14;
int ny = 1 << 14;

int nxy = nx * ny;
int nBytes = nxy * sizeof(float);

// malloc host memory
float *h_A, *h_B, *hostRef, *gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
hostRef = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);

// initialize data at host side
LARGE_INTEGER ta, tb, tc;
QueryPerformanceFrequency(&tc);
QueryPerformanceCounter(&ta);
printf("Generating random array...\n");
initialData(h_A, nxy);
initialData(h_B, nxy);
QueryPerformanceCounter(&tb);
printf("Generated.\n");

memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);

// add matrix at host side for result checks
QueryPerformanceCounter(&ta);
sumMatrixOnHost (h_A, h_B, hostRef, nx, ny);
QueryPerformanceCounter(&tb);

// malloc device global memory
float *d_MatA, *d_MatB, *d_MatC;
CHECK(cudaMalloc((void **)&d_MatA, nBytes));
CHECK(cudaMalloc((void **)&d_MatB, nBytes));
CHECK(cudaMalloc((void **)&d_MatC, nBytes));

// transfer data from host to device
CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

// invoke kernel at host side
int dimx = 32;
int dimy = 32;

if(argc > 2)
{
dimx = atoi(argv);
dimy = atoi(argv);
}

dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

// execute the kernel
QueryPerformanceCounter(&ta);
sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
QueryPerformanceCounter(&tb);
CHECK(cudaGetLastError());

// copy kernel result back to host side
CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

// check device results
checkResult(hostRef, gpuRef, nxy);

// free device global memory
CHECK(cudaFree(d_MatA));
CHECK(cudaFree(d_MatB));
CHECK(cudaFree(d_MatC));

// free host memory
free(h_A);
free(h_B);
free(hostRef);
free(gpuRef);

// reset device

return EXIT_SUCCESS;
}
```

The computer configuration of the author has also been introduced before, which is the NVIDIA GTX 1050 video card. This is the result given in the book, which is used to tell us that more thread blocks have better parallelism (512 * 1024 block number performance is better than 512 * 512). The results of running on my computer are as follows (Debug mode): It can be seen that when the block size is 32 * 32, it is the slowest. In fact, the efficiency of 16 * 32, 16 * 16 and 32 * 16 are almost the same. The results are different from those in the book. And in Release mode, the speed of the four is almost the same.

Use nvprof -- metrics achieved ﹣ occupancy < program. Exe > [Param1] [param2] The command can get the following data:  It can be seen that the fourth situation has the highest occupancy rate of thread bundles, but the speed is the second slowest, so we can still draw a conclusion: high occupancy rate does not necessarily get the fastest running speed, but also related to other factors.

Tags: Windows Linux

Posted on Sat, 09 Nov 2019 11:46:52 -0500 by gamefreak13