From b4bae3f9d009d388c18d60965d107fa177ff6cce Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Thu, 4 Dec 2025 22:06:05 -0500 Subject: [PATCH 1/5] q6_k faster mul mat --- .gitignore | 1 + .../vulkan-shaders/mul_mm_funcs.glsl | 18 +++++++++++------- .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 +- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 428f0841100..05eb578a82f 100644 --- a/.gitignore +++ b/.gitignore @@ -54,6 +54,7 @@ /out/ /tmp/ /autogen-*.md +/common/build-info.cpp # Deprecated diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index ee5ded2e8d3..a9f3317b86c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -240,21 +240,25 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 + const uint ib = idx / 64; // 4 values per idx + const uint iqs = (idx % 64) * 2; // 0,2,4..126 const uint n = iqs / 64; // 0,1 - const uint b = (iqs % 64) / 32; // 0,1 + const uint b = ((iqs % 64) / 32) * 4; // 0,4 const uint is_b = (iqs % 16) / 8; // 0,1 const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 - const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + const uint qsi = n * 32 + (iqs % 32); // 0..63 + const uint qhi = n * 16 + (iqs % 16); // 0..31 const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); - buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32), - dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); + const uint ql = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi]), uint(data_a_packed16[ib].ql[qsi + 1]), 16, 16) >> b) & 0x0F0F0F0F; + const uint qh = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi]), uint(data_a_packed16[ib].qh[qhi + 1]), 16, 16) >> qhshift) & 0x03030303; + const vec4 q = (vec4(unpack8(ql | (qh << 4))) - 32) * dscale; + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y); + buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(q.z, q.w); #elif defined(DATA_A_IQ1_S) const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 92bae088b20..22d623526c1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -554,7 +554,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c std::string load_vec_quant = "2"; if ((tname == "q4_0") || (tname == "q4_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) load_vec_quant = "8"; - else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4")) + else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "q6_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4")) load_vec_quant = "4"; if (tname == "bf16") { From d8c0d030ff863d7f0c5217b705bcb4d24dd8b666 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Fri, 5 Dec 2025 21:25:10 -0500 Subject: [PATCH 2/5] 8 values --- .../vulkan-shaders/mul_mm_funcs.glsl | 21 ++++++++++++------- .../vulkan-shaders/vulkan-shaders-gen.cpp | 4 ++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index a9f3317b86c..3b33668ffd4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -240,8 +240,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; - const uint ib = idx / 64; // 4 values per idx - const uint iqs = (idx % 64) * 2; // 0,2,4..126 + const uint ib = idx / 32; // 8 values per idx + const uint iqs = (idx % 32) * 4; // 0,4,8..124 const uint n = iqs / 64; // 0,1 const uint b = ((iqs % 64) / 32) * 4; // 0,4 @@ -253,12 +253,17 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); - const uint ql = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi]), uint(data_a_packed16[ib].ql[qsi + 1]), 16, 16) >> b) & 0x0F0F0F0F; - const uint qh = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi]), uint(data_a_packed16[ib].qh[qhi + 1]), 16, 16) >> qhshift) & 0x03030303; - const vec4 q = (vec4(unpack8(ql | (qh << 4))) - 32) * dscale; - - buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y); - buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(q.z, q.w); + const uint ql_0 = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi]), uint(data_a_packed16[ib].ql[qsi + 1]), 16, 16) >> b) & 0x0F0F0F0F; + const uint ql_1 = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi + 2]), uint(data_a_packed16[ib].ql[qsi + 3]), 16, 16) >> b) & 0x0F0F0F0F; + const uint qh_0 = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi]), uint(data_a_packed16[ib].qh[qhi + 1]), 16, 16) >> qhshift) & 0x03030303; + const uint qh_1 = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi + 2]), uint(data_a_packed16[ib].qh[qhi + 3]), 16, 16) >> qhshift) & 0x03030303; + const vec4 q_0 = (vec4(unpack8(ql_0 | (qh_0 << 4))) - 32) * dscale; + const vec4 q_1 = (vec4(unpack8(ql_1 | (qh_1 << 4))) - 32) * dscale; + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(q_0.x, q_0.y); + buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(q_0.z, q_0.w); + buf_a[buf_idx + 2] = FLOAT_TYPE_VEC2(q_1.x, q_1.y); + buf_a[buf_idx + 3] = FLOAT_TYPE_VEC2(q_1.z, q_1.w); #elif defined(DATA_A_IQ1_S) const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 22d623526c1..66066c03399 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -552,9 +552,9 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c for (const auto& tname : type_names) { std::string load_vec_quant = "2"; - if ((tname == "q4_0") || (tname == "q4_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) + if ((tname == "q4_0") || (tname == "q4_1") || (tname == "q6_k") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) load_vec_quant = "8"; - else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "q6_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4")) + else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4")) load_vec_quant = "4"; if (tname == "bf16") { From def0e1b865519fd9bf4bd8d92d785f55939b6eae Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sat, 6 Dec 2025 15:42:15 -0500 Subject: [PATCH 3/5] fix comment --- ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index 3b33668ffd4..6d20c60a36f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -248,8 +248,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin const uint is_b = (iqs % 16) / 8; // 0,1 const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 32 + (iqs % 32); // 0..63 - const uint qhi = n * 16 + (iqs % 16); // 0..31 + const uint qsi = n * 32 + (iqs % 32); // 0,4,8..60 + const uint qhi = n * 16 + (iqs % 16); // 0,4,8..28 const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); From 95b1a9b257a7ab0360e168963deec4929167b12e Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:21:05 -0500 Subject: [PATCH 4/5] switch to two at a time --- .../vulkan-shaders/mul_mm_funcs.glsl | 24 +++++++------------ .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 +- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index 6d20c60a36f..58ede04400d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -240,30 +240,24 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; - const uint ib = idx / 32; // 8 values per idx - const uint iqs = (idx % 32) * 4; // 0,4,8..124 + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 const uint n = iqs / 64; // 0,1 const uint b = ((iqs % 64) / 32) * 4; // 0,4 const uint is_b = (iqs % 16) / 8; // 0,1 const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 32 + (iqs % 32); // 0,4,8..60 - const uint qhi = n * 16 + (iqs % 16); // 0,4,8..28 + const uint qsi = n * 32 + (iqs % 32); // 0..63 + const uint qhi = n * 16 + (iqs % 16); // 0..31 const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); - const uint ql_0 = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi]), uint(data_a_packed16[ib].ql[qsi + 1]), 16, 16) >> b) & 0x0F0F0F0F; - const uint ql_1 = (bitfieldInsert(uint(data_a_packed16[ib].ql[qsi + 2]), uint(data_a_packed16[ib].ql[qsi + 3]), 16, 16) >> b) & 0x0F0F0F0F; - const uint qh_0 = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi]), uint(data_a_packed16[ib].qh[qhi + 1]), 16, 16) >> qhshift) & 0x03030303; - const uint qh_1 = (bitfieldInsert(uint(data_a_packed16[ib].qh[qhi + 2]), uint(data_a_packed16[ib].qh[qhi + 3]), 16, 16) >> qhshift) & 0x03030303; - const vec4 q_0 = (vec4(unpack8(ql_0 | (qh_0 << 4))) - 32) * dscale; - const vec4 q_1 = (vec4(unpack8(ql_1 | (qh_1 << 4))) - 32) * dscale; - - buf_a[buf_idx] = FLOAT_TYPE_VEC2(q_0.x, q_0.y); - buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(q_0.z, q_0.w); - buf_a[buf_idx + 2] = FLOAT_TYPE_VEC2(q_1.x, q_1.y); - buf_a[buf_idx + 3] = FLOAT_TYPE_VEC2(q_1.z, q_1.w); + const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F; + const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303; + const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale; + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y); #elif defined(DATA_A_IQ1_S) const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 66066c03399..92bae088b20 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -552,7 +552,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c for (const auto& tname : type_names) { std::string load_vec_quant = "2"; - if ((tname == "q4_0") || (tname == "q4_1") || (tname == "q6_k") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) + if ((tname == "q4_0") || (tname == "q4_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) load_vec_quant = "8"; else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4")) load_vec_quant = "4"; From 124da7f08437ffe5df8a49466ac09ba025ee59ef Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Wed, 10 Dec 2025 00:44:11 +0000 Subject: [PATCH 5/5] start ci for .glsl files --- .github/workflows/build.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ad205f3ec96..007fe9b445f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,8 @@ on: '**/*.swift', '**/*.m', '**/*.metal', - '**/*.comp' + '**/*.comp', + '**/*.glsl' ] pull_request: @@ -40,7 +41,8 @@ on: '**/*.swift', '**/*.m', '**/*.metal', - '**/*.comp' + '**/*.comp', + '**/*.glsl' ] concurrency: