From 85db3d46b5f120fd371c8926f70a979830be5723 Mon Sep 17 00:00:00 2001
From: miha-q <>
Date: Tue, 5 Mar 2024 19:35:47 -0500
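Subject: [PATCH] Fix kernel_knk_2 element indexing; re-enable GPU path

Snapshot of Tue Mar  5 07:35:47 PM EST 2024.

kernel_knk_2 computed posA/posB from the column counts (colsA/colsB)
instead of the current column (colA/colB); fix this in kernel.cl,
kernel_cpu.cl.c and the embedded copy in kernel.cl.c. Also set
USE_GPU back to 1, drop the "&& 0" guard on the GPU knk call, define
SPEED_TEST, and rewrite examples/slow.txt to apply x to all 14 qubits
followed by born.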

---
 examples/slow.txt   |  27 ++++++-----
 src/QAnsel.c        |  10 ++--
 src/kernel.cl       |   9 ++--
 src/kernel.cl.c     | 110 ++++++++++++++++++++++----------------------
 src/kernel_cpu.cl.c |   9 ++--
 5 files changed, 86 insertions(+), 79 deletions(-)

diff --git a/examples/slow.txt b/examples/slow.txt
index 7d34b45..f4126d6 100644
--- a/examples/slow.txt
+++ b/examples/slow.txt
@@ -1,13 +1,18 @@
 //designed to be slow
 qreg q[14];
-h q[0];
-h q[1];
-h q[2];
-h q[3];
-h q[4];
-h q[5];
-h q[6];
-h q[7];
-h q[8];
-h q[9];
-h q[10];
\ No newline at end of file
+x q[0];
+x q[1];
+x q[2];
+x q[3];
+x q[4];
+x q[5];
+x q[6];
+x q[7];
+x q[8];
+x q[9];
+x q[10];
+x q[11];
+x q[12];
+x q[13];
+
+born;
\ No newline at end of file
diff --git a/src/QAnsel.c b/src/QAnsel.c
index 0eca8d2..833970a 100644
--- a/src/QAnsel.c
+++ b/src/QAnsel.c
@@ -12,7 +12,7 @@ FILE* RANDOM_FILE;
 #define GPU_ENABLED
 unsigned char USE_GPU = 1;
 unsigned char USE_THREADS = 1;
-//#define SPEED_TEST
+#define SPEED_TEST
 
 typedef struct
 {
@@ -226,9 +226,9 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr
 		us1 = get_time();
 		cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		us2 = get_time();
-		printf("\tBare: %lu\n", us2 - us1);
+		printf("\tBare(1): %lu\n", us2 - us1);
 		#else
-		if (USE_GPU && 0) //this one's slower for some reason
+		if (USE_GPU) //this one's slower for some reason
 		{
 			cpx_mtx_knk_metal(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		}
@@ -238,7 +238,7 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr
 		}
 		else
 		{
-			cpx_mtx_knk_2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
+			cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		}
 		#endif
 
@@ -1404,7 +1404,7 @@ void main(int argc, char** argv)
 	USE_GPU = cpx_mtx_begin();
 	RANDOM_FILE = fopen("/dev/TrueRNG0", "r");
 	if (!RANDOM_FILE) RANDOM_FILE = fopen("/dev/random", "r");
-	USE_GPU = 0;
+	USE_GPU = 1;
 	USE_THREADS = 0;
 	process(argc, argv);
 	fclose(RANDOM_FILE);
diff --git a/src/kernel.cl b/src/kernel.cl
index 66c40db..37d178f 100644
--- a/src/kernel.cl
+++ b/src/kernel.cl
@@ -110,9 +110,9 @@ __kernel void kernel_knk_2
     //      carry out effectively a copy of
     //      the dot product procedure.
     const int rowsI = rowsB;
-    const int colsI = rowsB;
+    const int colsI = rowsI;
     const int rowsJ = colsA;
-    const int colsJ = colsA;
+    const int colsJ = rowsJ;
     const int rowsX = rowsA * rowsI;
     const int colsX = colsA * colsI;
     const int rowsY = rowsJ * rowsB;
@@ -146,8 +146,8 @@ __kernel void kernel_knk_2
         const int rowB = rowY % rowsB;
         const int colB = colY % colsB;
 
-        const int posA = rowA * (colsA * 2) + (colsA * 2);
-        const int posB = rowB * (colsB * 2) + (colsB * 2);
+        const int posA = rowA * (colsA * 2) + (colA * 2);
+        const int posB = rowB * (colsB * 2) + (colB * 2);
 
         const float rA = rowI == colI ? ptrA[posA] : 0;
         const float iA = rowI == colI ? ptrA[posA + 1] : 0;
@@ -165,4 +165,5 @@ __kernel void kernel_knk_2
     }
     ptrR[rowR * (colsR * 2) + (colR * 2)] = rR;
     ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR;
+
 }
diff --git a/src/kernel.cl.c b/src/kernel.cl.c
index 0b2ee4e..bcc822b 100644
--- a/src/kernel.cl.c
+++ b/src/kernel.cl.c
@@ -275,12 +275,12 @@ unsigned char src_kernel_cl[] = {
   0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x49, 0x20,
   0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20,
   0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63,
-  0x6f, 0x6c, 0x73, 0x49, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42,
+  0x6f, 0x6c, 0x73, 0x49, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x49,
   0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
   0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x4a, 0x20, 0x3d, 0x20,
   0x63, 0x6f, 0x6c, 0x73, 0x41, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63,
   0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c,
-  0x73, 0x4a, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x3b, 0x0a,
+  0x73, 0x4a, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x4a, 0x3b, 0x0a,
   0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e,
   0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x58, 0x20, 0x3d, 0x20, 0x72, 0x6f,
   0x77, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x49, 0x3b,
@@ -368,60 +368,60 @@ unsigned char src_kernel_cl[] = {
   0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73,
   0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x41, 0x20, 0x2a, 0x20, 0x28,
   0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b,
-  0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x42,
-  0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x2a, 0x20, 0x28, 0x63,
-  0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20,
-  0x28, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41,
-  0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x49, 0x20, 0x3d, 0x3d, 0x20, 0x63,
-  0x6f, 0x6c, 0x49, 0x20, 0x3f, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70,
-  0x6f, 0x73, 0x41, 0x5d, 0x20, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, 0x72,
-  0x6f, 0x77, 0x49, 0x20, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x49, 0x20,
-  0x3f, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20,
-  0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
+  0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x42, 0x20,
+  0x3d, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f,
+  0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28,
+  0x63, 0x6f, 0x6c, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+  0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41, 0x20, 0x3d,
+  0x20, 0x72, 0x6f, 0x77, 0x49, 0x20, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6c,
+  0x49, 0x20, 0x3f, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73,
+  0x41, 0x5d, 0x20, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c,
+  0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77,
+  0x49, 0x20, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x49, 0x20, 0x3f, 0x20,
+  0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20,
+  0x31, 0x5d, 0x20, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c,
+  0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77,
+  0x4a, 0x20, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x4a, 0x20, 0x3f, 0x20,
+  0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x20, 0x3a,
+  0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20,
+  0x69, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x4a, 0x20, 0x3d, 0x3d,
+  0x20, 0x63, 0x6f, 0x6c, 0x4a, 0x20, 0x3f, 0x20, 0x70, 0x74, 0x72, 0x42,
+  0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3a,
+  0x20, 0x30, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29,
+  0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x72,
-  0x6f, 0x77, 0x4a, 0x20, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x4a, 0x20,
-  0x3f, 0x20, 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d,
-  0x20, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61,
-  0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x4a, 0x20,
-  0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x4a, 0x20, 0x3f, 0x20, 0x70, 0x74,
-  0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, 0x5d,
-  0x20, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, 0x20, 0x69,
-  0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, 0x29, 0x0a,
+  0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20,
+  0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72,
+  0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73,
-  0x74, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b,
+  0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, 0x65,
+  0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b,
   0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74,
-  0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e,
-  0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72,
-  0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c,
-  0x61, 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20,
-  0x69, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x52, 0x20, 0x2b, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74,
-  0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x52, 0x20, 0x2b, 0x3d, 0x20,
-  0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65,
-  0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a,
-  0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29,
-  0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32,
-  0x29, 0x5d, 0x20, 0x3d, 0x20, 0x72, 0x52, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a,
-  0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29,
-  0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32,
-  0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x69, 0x52, 0x3b,
-  0x0a, 0x7d, 0x00
+  0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73,
+  0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x52, 0x20, 0x2b, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b,
+  0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x52, 0x20, 0x2b, 0x3d, 0x20, 0x6f, 0x75,
+  0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70,
+  0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28,
+  0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b,
+  0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d,
+  0x20, 0x3d, 0x20, 0x72, 0x52, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70,
+  0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28,
+  0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b,
+  0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20,
+  0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x69, 0x52, 0x3b, 0x0a, 0x0a,
+  0x7d, 0x00
 };
-unsigned int src_kernel_cl_len = 5079;
+unsigned int src_kernel_cl_len = 5078;
diff --git a/src/kernel_cpu.cl.c b/src/kernel_cpu.cl.c
index c304adc..908a689 100644
--- a/src/kernel_cpu.cl.c
+++ b/src/kernel_cpu.cl.c
@@ -104,9 +104,9 @@ void kernel_knk_2
     //      carry out effectively a copy of
     //      the dot product procedure.
     const int rowsI = rowsB;
-    const int colsI = rowsB;
+    const int colsI = rowsI;
     const int rowsJ = colsA;
-    const int colsJ = colsA;
+    const int colsJ = rowsJ;
     const int rowsX = rowsA * rowsI;
     const int colsX = colsA * colsI;
     const int rowsY = rowsJ * rowsB;
@@ -138,8 +138,8 @@ void kernel_knk_2
         const int rowB = rowY % rowsB;
         const int colB = colY % colsB;
 
-        const int posA = rowA * (colsA * 2) + (colsA * 2);
-        const int posB = rowB * (colsB * 2) + (colsB * 2);
+        const int posA = rowA * (colsA * 2) + (colA * 2);
+        const int posB = rowB * (colsB * 2) + (colB * 2);
 
         const float rA = rowI == colI ? ptrA[posA] : 0;
         const float iA = rowI == colI ? ptrA[posA + 1] : 0;
@@ -157,4 +157,5 @@ void kernel_knk_2
     }
     ptrR[rowR * (colsR * 2) + (colR * 2)] = rR;
     ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR;
+
 }
-- 
2.39.5