Restore hardcoded 32 in init_iq_shmem (AMD performance regression)

This commit is contained in:
Rémy O 2025-01-25 14:26:02 +01:00
parent 6ed3047e41
commit 3f7aa9dcb1

View File

@@ -383,8 +383,10 @@ shared uvec2 iq2xxs_grid[256];
void init_iq_shmem()
{
// copy the table into shared memory and sync
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += gl_WorkGroupSize.x) {
iq2xxs_grid[i] = iq2xxs_grid_const[i];
if (gl_LocalInvocationIndex.x < 32) {
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += 32) {
iq2xxs_grid[i] = iq2xxs_grid_const[i];
}
}
barrier();
}
@@ -550,8 +552,10 @@ shared uvec2 iq2xs_grid[512];
void init_iq_shmem()
{
// copy the table into shared memory and sync
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += gl_WorkGroupSize.x) {
iq2xs_grid[i] = iq2xs_grid_const[i];
if (gl_LocalInvocationIndex.x < 32) {
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += 32) {
iq2xs_grid[i] = iq2xs_grid_const[i];
}
}
barrier();
}
@@ -839,8 +843,10 @@ shared uvec2 iq2s_grid[1024];
void init_iq_shmem()
{
// copy the table into shared memory and sync
for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += gl_WorkGroupSize.x) {
iq2s_grid[i] = iq2s_grid_const[i];
if (gl_LocalInvocationIndex.x < 32) {
for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += 32) {
iq2s_grid[i] = iq2s_grid_const[i];
}
}
barrier();
}
@@ -907,8 +913,10 @@ shared uint32_t iq3xxs_grid[256];
void init_iq_shmem()
{
// copy the table into shared memory and sync
for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += gl_WorkGroupSize.x) {
iq3xxs_grid[i] = iq3xxs_grid_const[i];
if (gl_LocalInvocationIndex.x < 32) {
for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += 32) {
iq3xxs_grid[i] = iq3xxs_grid_const[i];
}
}
barrier();
}
@@ -1014,8 +1022,10 @@ shared uint32_t iq3s_grid[512];
void init_iq_shmem()
{
// copy the table into shared memory and sync
for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += gl_WorkGroupSize.x) {
iq3s_grid[i] = iq3s_grid_const[i];
if (gl_LocalInvocationIndex.x < 32) {
for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += 32) {
iq3s_grid[i] = iq3s_grid_const[i];
}
}
barrier();
}