mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 00:39:00 +01:00
llama : first working version
This commit is contained in:
parent
af1a096bf8
commit
7ea36953ba
9
ggml.c
9
ggml.c
@ -4105,7 +4105,9 @@ struct ggml_tensor * ggml_mul_mat_id(
|
|||||||
result->src[0] = ids;
|
result->src[0] = ids;
|
||||||
result->src[1] = b;
|
result->src[1] = b;
|
||||||
|
|
||||||
for (int64_t i = 0; i < n_as; i++) {
|
// TODO: n_as is the selected experts, but it should be the total number of experts
|
||||||
|
//for (int64_t i = 0; i < n_as; i++) {
|
||||||
|
for (int64_t i = 0; i < 8; i++) {
|
||||||
struct ggml_tensor * a = as[i];
|
struct ggml_tensor * a = as[i];
|
||||||
GGML_ASSERT(ggml_are_same_shape(as[0], a));
|
GGML_ASSERT(ggml_are_same_shape(as[0], a));
|
||||||
GGML_ASSERT(ggml_can_mul_mat(a, b));
|
GGML_ASSERT(ggml_can_mul_mat(a, b));
|
||||||
@ -9758,7 +9760,10 @@ static void ggml_compute_forward_mul_mat_id(
|
|||||||
|
|
||||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||||
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
|
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
|
|
||||||
|
// TODO: this assert seems wrong?
|
||||||
|
//printf("row_id = %d, ids->ne[0] = %d, id = %d\n", row_id, ids->ne[0], id);
|
||||||
|
//GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
|
||||||
|
|
||||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||||
ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
|
ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
|
||||||
|
2
ggml.h
2
ggml.h
@ -217,7 +217,7 @@
|
|||||||
#define GGML_MAX_DIMS 4
|
#define GGML_MAX_DIMS 4
|
||||||
#define GGML_MAX_PARAMS 1024
|
#define GGML_MAX_PARAMS 1024
|
||||||
#define GGML_MAX_CONTEXTS 64
|
#define GGML_MAX_CONTEXTS 64
|
||||||
#define GGML_MAX_SRC 6
|
#define GGML_MAX_SRC 10
|
||||||
#define GGML_MAX_NAME 64
|
#define GGML_MAX_NAME 64
|
||||||
#define GGML_MAX_OP_PARAMS 64
|
#define GGML_MAX_OP_PARAMS 64
|
||||||
#define GGML_DEFAULT_N_THREADS 4
|
#define GGML_DEFAULT_N_THREADS 4
|
||||||
|
@ -4242,14 +4242,18 @@ struct llm_build_context {
|
|||||||
LLM_NORM_RMS, cb, il);
|
LLM_NORM_RMS, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
const int n_experts_per_tok = 2; // TODO: param
|
// TODO: param
|
||||||
|
const int n_experts = 8;
|
||||||
|
const int n_experts_per_tok = 2;
|
||||||
|
|
||||||
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
||||||
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
||||||
|
|
||||||
// select experts
|
// select experts
|
||||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok]
|
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok]
|
||||||
ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1]
|
//ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1]
|
||||||
|
ggml_tensor * weights = ggml_get_rows(ctx0,
|
||||||
|
ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts);
|
||||||
weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok, 1]
|
weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok, 1]
|
||||||
|
|
||||||
// compute expert outputs
|
// compute expert outputs
|
||||||
|
Loading…
Reference in New Issue
Block a user