}
else
{
- cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
+ cpx_mtx_knk_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
}
#endif
}
}
+/*
+ * cpx_mtx_knk_2x2
+ *
+ * Serial driver: calls kernel_knk_2x2 once for every index pair (i, j) with
+ *   0 <= i < (rowsA * rowsB) / 2   and   0 <= j < (colsA * colsB) / 2.
+ *
+ * ptrR         - forwarded unchanged as the kernel's first argument
+ *                (presumably the output buffer - confirm against the kernel).
+ * ptrA         - forwarded unchanged together with rowsA and colsA.
+ * ptrB         - only elements [0..7] are read here; they are handed to the
+ *                kernel as eight separate scalar arguments.
+ * rowsB, colsB - used only to size the iteration space; not forwarded.
+ *
+ * NOTE(review): presumably B is a 2x2 complex matrix stored as 8 floats and
+ * each kernel call fills one 2x2 tile of the Kronecker product - TODO confirm
+ * against kernel_knk_2x2, which is not visible in this hunk. Nothing here
+ * checks that rowsB and colsB are actually 2.
+ */
+void cpx_mtx_knk_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
+{
+ int rowsR = rowsA * rowsB;
+ int colsR = colsA * colsB;
+ for (int i = 0; i < rowsR / 2; i++) /* integer division: truncates when rowsR is odd */
+ {
+ for (int j = 0; j < colsR / 2; j++) /* integer division: truncates when colsR is odd */
+ {
+ kernel_knk_2x2(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i, j); /* B's first eight floats are passed by value; (i, j) selects the work item */
+ }
+ }
+}
+
/*-----------------------------------------------------------------------------------*/
/*THREADED*/
/*-----------------------------------------------------------------------------------*/
err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg);
//Run the program
- err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 2, colsR / 2}, NULL, 0, NULL, NULL);
+ err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR / 2, colsR / 2}, NULL, 0, NULL, NULL);
gpuerr(clEnqueueNDRangeKernel);
//Wait for completion
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x7d, 0x0a, 0x7d, 0x00
+ 0x7d, 0x0a, 0x0a, 0x7d, 0x00
};
-unsigned int kernel_gpu_len = 4264;
+unsigned int kernel_gpu_len = 4265;