/*
 * sgemm -- OpenCL kernel excerpt. The leading numbers on each line are the
 * line numbers of the listing this was copied from; several listing lines
 * (remaining parameters, braces, some declarations) are not part of this view.
 *
 * From the visible lines: each work-item reads its (i, j) position from the
 * 2-D global id, accumulates products of values loaded from matrixA and
 * matrixB in a float4, and finally stores
 *     alpha * (sum of the four lanes) + beta * matrixC[i * matrixOrder + j]
 * back into matrixC[i * matrixOrder + j].
 *
 * matrixA, matrixB, matrixC: __global float buffers, declared restrict.
 * matrixOrder: used both as the row stride (i * matrixOrder) and as the loop
 *              bound; presumably the matrices are square, matrixOrder x
 *              matrixOrder, stored row-major -- TODO confirm with host code.
 * NOTE(review): alpha and beta are used in the final store but are not
 * declared in the visible lines; presumably they are kernel parameters on
 * the omitted listing lines 24-27 -- confirm.
 */
20 __kernel
void sgemm(__global
float* restrict matrixA,
21 __global
float* restrict
const matrixB,
22 __global
float* restrict matrixC,
23 const uint matrixOrder,
/* i comes from global id dimension 1, j from dimension 0; they index the
 * matrixC element written at the end as matrixC[i * matrixOrder + j]. */
28 const int i = get_global_id(1);
29 const int j = get_global_id(0);
/* Four-lane accumulator; the scalar cast sets every lane to 0.0f. */
30 float4 sum = (float4)0.0f;
/* Advance matrixA by i * matrixOrder elements so later loads are relative to
 * that offset (the start of row i, if the layout is row-major as assumed). */
35 matrixA += i * matrixOrder;
/* k advances by 4 per iteration, matching the four matrixB loads and the
 * 4-wide vload4 below.
 * NOTE(review): k is int while matrixOrder is uint, so the comparison is done
 * in unsigned arithmetic. Whether matrixOrder must be a multiple of 4 depends
 * on lines not shown here -- confirm. */
46 for (
int k = 0; k < matrixOrder; k+=4)
/* Gather four matrixB values spaced matrixOrder elements apart into the
 * lanes of matrixBColumn. bOffset and matrixBColumn are declared outside the
 * visible lines; presumably bOffset starts at column j -- TODO confirm. */
48 matrixBColumn.x = matrixB[bOffset];
49 bOffset += matrixOrder;
51 matrixBColumn.y = matrixB[bOffset];
52 bOffset += matrixOrder;
54 matrixBColumn.z = matrixB[bOffset];
55 bOffset += matrixOrder;
57 matrixBColumn.w = matrixB[bOffset];
58 bOffset += matrixOrder;
/* Load four consecutive floats starting at matrixA, multiply lane-wise with
 * the gathered matrixB values, and accumulate.
 * NOTE(review): no per-iteration advance of matrixA is visible in this
 * excerpt; presumably it happens on an omitted listing line -- confirm. */
62 sum += vload4 (0, matrixA) * matrixBColumn;
/* Reduce the four lanes to a scalar, scale by alpha, and add beta times the
 * value already stored in matrixC at this position. */
72 matrixC[i * matrixOrder + j] = alpha * (sum.x + sum.y + sum.z + sum.w) + beta * matrixC[i * matrixOrder + j];