// Review notes (free text ahead of the "diff --git" header; not part of any hunk).
//
// Patch against ggml/src/ggml-vulkan/vulkan-shaders/types.comp. It makes the same
// change in five init_iq_shmem() definitions, identified by the hunk headers as the
// ones following the shared tables iq2xxs_grid[256], iq2xs_grid[512],
// iq2s_grid[1024], iq3xxs_grid[256] and iq3s_grid[512].
//
// Removed: every invocation copied *_grid_const[i] into the shared *_grid[i],
//   starting at i = gl_LocalInvocationIndex.x and advancing by gl_WorkGroupSize.x.
// Added:   only invocations with gl_LocalInvocationIndex.x < 32 copy, starting at
//   their own index and advancing by a fixed 32.
// Kept:    barrier() stays after the new `if`, outside it, so invocations that
//   skip the copy still execute it.
//
// The declared table sizes (256, 512, 1024) are all multiples of 32.
// NOTE(review): with a fixed stride of 32, the indices congruent to k (mod 32) are
//   written only by the invocation whose index is k, so full coverage presumes at
//   least 32 invocations are running - the launch size is not visible in this
//   patch; confirm against the pipelines that include these tables.
// NOTE(review): the patch does not say why gl_WorkGroupSize.x was dropped; record
//   the reason in the commit message.
// NOTE(review): as seen here the hunks are run together on two physical lines;
//   presumably the per-line breaks need restoring before this can be applied.
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 6b874aad4..bce578ceb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -383,8 +383,10 @@ shared uvec2 iq2xxs_grid[256]; void init_iq_shmem() { // copy the table into shared memory and sync - for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += gl_WorkGroupSize.x) { - iq2xxs_grid[i] = iq2xxs_grid_const[i]; + if (gl_LocalInvocationIndex.x < 32) { + for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += 32) { + iq2xxs_grid[i] = iq2xxs_grid_const[i]; + } } barrier(); } @@ -550,8 +552,10 @@ shared uvec2 iq2xs_grid[512]; void init_iq_shmem() { // copy the table into shared memory and sync - for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += gl_WorkGroupSize.x) { - iq2xs_grid[i] = iq2xs_grid_const[i]; + if (gl_LocalInvocationIndex.x < 32) { + for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += 32) { + iq2xs_grid[i] = iq2xs_grid_const[i]; + } } barrier(); } @@ -839,8 +843,10 @@ shared uvec2 iq2s_grid[1024]; void init_iq_shmem() { // copy the table into shared memory and sync - for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += gl_WorkGroupSize.x) { - iq2s_grid[i] = iq2s_grid_const[i]; + if (gl_LocalInvocationIndex.x < 32) { + for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += 32) { + iq2s_grid[i] = iq2s_grid_const[i]; + } } barrier(); } @@ -907,8 +913,10 @@ shared uint32_t iq3xxs_grid[256]; void init_iq_shmem() { // copy the table into shared memory and sync - for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += gl_WorkGroupSize.x) { - iq3xxs_grid[i] = iq3xxs_grid_const[i]; + if (gl_LocalInvocationIndex.x < 32) { + for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += 32) { + iq3xxs_grid[i] = iq3xxs_grid_const[i]; + } } barrier(); } @@ -1014,8 
+1022,10 @@ shared uint32_t iq3s_grid[512]; void init_iq_shmem() { // copy the table into shared memory and sync - for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += gl_WorkGroupSize.x) { - iq3s_grid[i] = iq3s_grid_const[i]; + if (gl_LocalInvocationIndex.x < 32) { + for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += 32) { + iq3s_grid[i] = iq3s_grid_const[i]; + } } barrier(); }