ggml-org · netrunnereve · Dec 5, 2025 · Dec 6, 2025 · Dec 6, 2025
diff --git a/.gitignore b/.gitignore
@@ -54,6 +54,7 @@
 /out/
 /tmp/
 /autogen-*.md
+/common/build-info.cpp
 
 # Deprecated
 

@@ -240,21 +240,30 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
             const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
             const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
 
-            const uint ib = idx / 128;                  // 2 values per idx
-            const uint iqs = idx % 128;                 // 0..127
+            const uint ib = idx / 32;                   // 8 values per idx
+            const uint iqs = (idx % 32) * 4;            // 0,4,8..124
 
             const uint n = iqs / 64;                    // 0,1
-            const uint b = (iqs % 64) / 32;             // 0,1
+            const uint b = ((iqs % 64) / 32) * 4;       // 0,4
             const uint is_b = (iqs % 16) / 8;           // 0,1
             const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
             const uint is = 8 * n + qhshift + is_b;     // 0..15
-            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
-            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+            const uint qsi = n * 32 + (iqs % 32);       // 0,4,8..60
+            const uint qhi = n * 16 + (iqs % 16);       // 0,4,8..28
 
             const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
 
-            buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32),
-                                             dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+            const uint ql_0 = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi]), uint(data_a_packed16[ib].ql[qsi + 1]), 16, 16) >> b) & 0x0F0F0F0F;
+            const uint ql_1 = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi + 2]), uint(data_a_packed16[ib].ql[qsi + 3]), 16, 16) >> b) & 0x0F0F0F0F;
+            const uint qh_0 = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi]), uint(data_a_packed16[ib].qh[qhi + 1]), 16, 16) >> qhshift) & 0x03030303;
+            const uint qh_1 = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi + 2]), uint(data_a_packed16[ib].qh[qhi + 3]), 16, 16) >> qhshift) & 0x03030303;
+            const vec4 q_0 = (vec4(unpack8(ql_0 | (qh_0 << 4))) - 32) * dscale;
+            const vec4 q_1 = (vec4(unpack8(ql_1 | (qh_1 << 4))) - 32) * dscale;
+
+            buf_a[buf_idx] = FLOAT_TYPE_VEC2(q_0.x, q_0.y);
+            buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(q_0.z, q_0.w);
+            buf_a[buf_idx + 2] = FLOAT_TYPE_VEC2(q_1.x, q_1.y);
+            buf_a[buf_idx + 3] = FLOAT_TYPE_VEC2(q_1.z, q_1.w);
 #elif defined(DATA_A_IQ1_S)
             const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
             const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;

@@ -552,7 +552,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
 
     for (const auto& tname : type_names) {
         std::string load_vec_quant = "2";
-        if ((tname == "q4_0") || (tname == "q4_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
+        if ((tname == "q4_0") || (tname == "q4_1") || (tname == "q6_k") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
             load_vec_quant = "8";
         else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4"))
             load_vec_quant = "4";
-Original file line number
+Diff line change
@@ Expand Up / @@ -54,6 +54,7 @@ @@
     /out/
     /tmp/
     /autogen-*.md
+    /common/build-info.cpp
     # Deprecated
@@ Expand Down @@