mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 14:20:31 +01:00
Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439)
Should make results reproducible for different number of threads and batch sizes
This commit is contained in:
parent
404e1da38e
commit
483bab2e3d
@ -727,11 +727,13 @@ static bool llama_eval_internal(
|
|||||||
|
|
||||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||||
struct ggml_tensor * V_trans =
|
struct ggml_tensor * V_trans =
|
||||||
|
ggml_cpy(ctx0,
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_reshape_3d(ctx0,
|
ggml_reshape_3d(ctx0,
|
||||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||||
n_embd/n_head, n_head, n_past + N),
|
n_embd/n_head, n_head, n_past + N),
|
||||||
1, 2, 0, 3);
|
1, 2, 0, 3),
|
||||||
|
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
||||||
|
|
||||||
// KQV = transpose(V) * KQ_soft_max
|
// KQV = transpose(V) * KQ_soft_max
|
||||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||||
|
Loading…
Reference in New Issue
Block a user