mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-04 23:52:32 +01:00
Restore hardcoded stride of 32 in init_iq_shmem (to address an AMD performance regression)
This commit is contained in:
parent
6ed3047e41
commit
3f7aa9dcb1
@@ -383,9 +383,11 @@ shared uvec2 iq2xxs_grid[256];
|
|||||||
void init_iq_shmem()
|
void init_iq_shmem()
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += gl_WorkGroupSize.x) {
|
if (gl_LocalInvocationIndex.x < 32) {
|
||||||
|
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += 32) {
|
||||||
iq2xxs_grid[i] = iq2xxs_grid_const[i];
|
iq2xxs_grid[i] = iq2xxs_grid_const[i];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -550,9 +552,11 @@ shared uvec2 iq2xs_grid[512];
|
|||||||
void init_iq_shmem()
|
void init_iq_shmem()
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += gl_WorkGroupSize.x) {
|
if (gl_LocalInvocationIndex.x < 32) {
|
||||||
|
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += 32) {
|
||||||
iq2xs_grid[i] = iq2xs_grid_const[i];
|
iq2xs_grid[i] = iq2xs_grid_const[i];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -839,9 +843,11 @@ shared uvec2 iq2s_grid[1024];
|
|||||||
void init_iq_shmem()
|
void init_iq_shmem()
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += gl_WorkGroupSize.x) {
|
if (gl_LocalInvocationIndex.x < 32) {
|
||||||
|
for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += 32) {
|
||||||
iq2s_grid[i] = iq2s_grid_const[i];
|
iq2s_grid[i] = iq2s_grid_const[i];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -907,9 +913,11 @@ shared uint32_t iq3xxs_grid[256];
|
|||||||
void init_iq_shmem()
|
void init_iq_shmem()
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += gl_WorkGroupSize.x) {
|
if (gl_LocalInvocationIndex.x < 32) {
|
||||||
|
for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += 32) {
|
||||||
iq3xxs_grid[i] = iq3xxs_grid_const[i];
|
iq3xxs_grid[i] = iq3xxs_grid_const[i];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1014,9 +1022,11 @@ shared uint32_t iq3s_grid[512];
|
|||||||
void init_iq_shmem()
|
void init_iq_shmem()
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += gl_WorkGroupSize.x) {
|
if (gl_LocalInvocationIndex.x < 32) {
|
||||||
|
for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += 32) {
|
||||||
iq3s_grid[i] = iq3s_grid_const[i];
|
iq3s_grid[i] = iq3s_grid_const[i];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user