Use correct type of pooling for embedding models (#5500)

Author: Douglas Hanley
Date:   2024-02-15 11:21:49 -06:00 (committed by GitHub)
parent c06e45d729
commit 4524290e87
5 changed files with 94 additions and 31 deletions

convert-hf-to-gguf.py

@@ -1650,7 +1650,29 @@ class BertModel(Model):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_causal_attention(False)
-        self.gguf_writer.add_pooling_layer(True)
+
+        # get pooling path
+        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
+            modules = json.load(f)
+        pooling_path = None
+        for mod in modules:
+            if mod["type"] == "sentence_transformers.models.Pooling":
+                pooling_path = mod["path"]
+                break
+
+        # get pooling type
+        pooling_type = gguf.PoolingType.NONE
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            else:
+                raise NotImplementedError("Only MEAN and CLS pooling types supported")
+        self.gguf_writer.add_pooling_type(pooling_type.value)

     def set_vocab(self):
         path = self.dir_model
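Note: the detection above follows the standard sentence-transformers layout, where modules.json lists the model's modules and the Pooling module's own config.json records the pooling mode the model was trained with. A minimal sketch of the resulting mapping, with illustrative (not commit-provided) config values for a mean-pooled model:

```python
# Sketch: map a sentence-transformers Pooling config to gguf.PoolingType.
# `modules` imitates a typical modules.json and `pooling_cfg` a typical
# 1_Pooling/config.json; the concrete values are illustrative only.
import gguf

modules = [
    {"idx": 0, "name": "0", "path": "",          "type": "sentence_transformers.models.Transformer"},
    {"idx": 1, "name": "1", "path": "1_Pooling", "type": "sentence_transformers.models.Pooling"},
]
pooling_path = next((m["path"] for m in modules
                     if m["type"] == "sentence_transformers.models.Pooling"), None)

pooling_cfg = {
    "pooling_mode_cls_token": False,
    "pooling_mode_mean_tokens": True,   # mean pooling, e.g. all-MiniLM style
    "pooling_mode_max_tokens": False,
}

if pooling_cfg["pooling_mode_mean_tokens"]:
    pooling_type = gguf.PoolingType.MEAN   # written as "{arch}.pooling_type" = 1
elif pooling_cfg["pooling_mode_cls_token"]:
    pooling_type = gguf.PoolingType.CLS    # written as "{arch}.pooling_type" = 2
else:
    raise NotImplementedError("Only MEAN and CLS pooling types supported")
```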

gguf-py/gguf/constants.py

@@ -40,7 +40,7 @@ class Keys:
         TENSOR_DATA_LAYOUT       = "{arch}.tensor_data_layout"
         EXPERT_COUNT             = "{arch}.expert_count"
         EXPERT_USED_COUNT        = "{arch}.expert_used_count"
-        POOLING_LAYER            = "{arch}.pooling_layer"
+        POOLING_TYPE             = "{arch}.pooling_type"

     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -561,6 +561,12 @@ class RopeScalingType(Enum):
     YARN   = 'yarn'


+class PoolingType(IntEnum):
+    NONE = 0
+    MEAN = 1
+    CLS  = 2
+
+
 class GGMLQuantizationType(IntEnum):
     F32  = 0
     F16  = 1

gguf-py/gguf/gguf_writer.py

@@ -19,6 +19,7 @@ from .constants import (
     GGUFValueType,
     Keys,
     RopeScalingType,
+    PoolingType,
     TokenType,
 )
@@ -360,8 +361,8 @@ class GGUFWriter:
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)

-    def add_pooling_layer(self, value: bool) -> None:
-        self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
+    def add_pooling_type(self, value: PoolingType) -> None:
+        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value)

     def add_rope_dimension_count(self, count: int) -> None:
         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

llama.cpp

@@ -256,7 +256,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
-    LLM_KV_POOLING_LAYER,
+    LLM_KV_POOLING_TYPE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -314,7 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT,      "%s.tensor_data_layout"      },
     { LLM_KV_EXPERT_COUNT,            "%s.expert_count"            },
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"       },
-    { LLM_KV_POOLING_LAYER,           "%s.pooling_layer"           },
+    { LLM_KV_POOLING_TYPE,            "%s.pooling_type"            },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1561,7 +1561,7 @@ struct llama_hparams {
     float f_max_alibi_bias;

     bool causal_attn = true;
-    bool pooling_layer = false;
+    uint32_t pooling_type = LLAMA_POOLING_NONE;

     bool operator!=(const llama_hparams & other) const {
@@ -1924,7 +1924,8 @@ struct llama_context {
     struct ggml_tensor * inp_pos;      // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask;  // F32 [n_ctx, n_batch]
     struct ggml_tensor * inp_K_shift;  // I32 [n_ctx]
-    struct ggml_tensor * inp_sum;      // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_mean;     // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_cls;      // I32 [n_batch]

 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -3086,7 +3087,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
             ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
             ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-            ml.get_key(LLM_KV_POOLING_LAYER,              hparams.pooling_layer);
+            ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);

             switch (hparams.n_layer) {
                 case 3:
@@ -3107,7 +3108,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
             ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
             ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-            ml.get_key(LLM_KV_POOLING_LAYER,              hparams.pooling_layer);
+            ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);

             if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                 model.type = e_model::MODEL_137M;
@@ -4934,7 +4935,7 @@ struct llm_build_context {
     const int32_t n_orig_ctx;

     const bool do_rope_shift;
-    const bool do_pooling;
+    const uint32_t pooling_type;

     const llm_build_cb & cb;
@@ -4978,7 +4979,7 @@ struct llm_build_context {
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
         do_rope_shift    (worst_case || kv_self.has_shift),
-        do_pooling       (hparams.pooling_layer && cparams.do_pooling),
+        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -5835,7 +5836,8 @@ struct llm_build_context {
         // get input vectors with right size
         const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
         struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        struct ggml_tensor * inp_sum  = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);

         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5952,8 +5954,12 @@ struct llm_build_context {
         cur = inpL;

         // pooling layer
-        if (do_pooling) {
-            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+        if (pooling_type == LLAMA_POOLING_MEAN) {
+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+        } else if (pooling_type == LLAMA_POOLING_CLS) {
+            cur = ggml_get_rows(ctx0, cur, inp_cls);
+        } else {
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
         }

         cb(cur, "result_embd", -1);
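Note: the two pooling paths reduce to simple linear algebra. MEAN pooling multiplies the token embeddings by the [n_batch, n_batch] inp_mean matrix, whose row s holds 1/len(sequence s) at that sequence's token columns; CLS pooling gathers one row per sequence via the indices in inp_cls. A NumPy sketch of the equivalent computation (row-major NumPy rather than ggml's mul_mat on the transposed tensor; shapes and values are illustrative):

```python
# Illustration of the MEAN and CLS pooling paths on a packed batch of two
# sequences; `cur` stands in for the final token embeddings [n_tokens, n_embd].
import numpy as np

n_tokens, n_embd = 5, 4
cur    = np.random.rand(n_tokens, n_embd).astype(np.float32)
seq_id = np.array([0, 0, 0, 1, 1])    # sequence of each token in the batch

# LLAMA_POOLING_MEAN: inp_mean[s, i] = 1/count(s) if token i belongs to seq s
inp_mean = np.zeros((n_tokens, n_tokens), dtype=np.float32)
for i, s in enumerate(seq_id):
    inp_mean[s, i] = 1.0 / np.sum(seq_id == s)
mean_pooled = inp_mean @ cur          # row s = mean embedding of sequence s

# LLAMA_POOLING_CLS: gather the embedding of each sequence's position-0 token
inp_cls    = np.array([0, 3])         # indices of the CLS tokens
cls_pooled = cur[inp_cls]             # shape [n_seq, n_embd]

assert np.allclose(mean_pooled[0], cur[:3].mean(axis=0))
```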
@@ -7501,15 +7507,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    {
-        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
-        float * data = (float *) lctx.inp_sum->data;
-
-        for (int i = 0; i < batch.n_tokens; ++i) {
-            data[i] = 1.0f/float(batch.n_tokens);
-        }
-    }
-
     if (kv_self.has_shift) {
         const int64_t n_ctx = cparams.n_ctx;
@@ -7522,17 +7519,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (hparams.pooling_layer && cparams.do_pooling) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
         const int64_t n_tokens = batch.n_tokens;

-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
-        float * data = (float *) lctx.inp_sum->data;
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+        float * data = (float *) lctx.inp_mean->data;

-        memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+
+        std::vector<uint64_t> sum(n_tokens, 0);
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            sum[seq_id] += 1;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }

         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
-            data[seq_id*n_tokens + i] = 1.0f;
+            data[seq_id*n_tokens + i] = div[seq_id];
+        }
+    }
+
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos    pos    = batch.pos[i];
+            if (pos == 0) {
+                data[seq_id] = i;
+            }
         }
     }
 }
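Note: the rewritten input setup weights each token by 1/(number of tokens in its own sequence) rather than a single 1/n_tokens for the whole batch (the block removed above), and inp_cls records the index of each sequence's position-0 token. A plain-Python sketch of the same construction (seq_ids and positions stand in for batch.seq_id[i][0] and batch.pos[i]; the helper is hypothetical, not llama.cpp API):

```python
# Sketch of how inp_mean / inp_cls are filled for a packed batch.
import numpy as np

def build_pooling_inputs(seq_ids, positions):
    n_tokens = len(seq_ids)

    # inp_mean: row s holds 1/count(s) at the columns of sequence s's tokens
    counts = np.bincount(seq_ids, minlength=n_tokens)
    inp_mean = np.zeros((n_tokens, n_tokens), dtype=np.float32)
    for i, s in enumerate(seq_ids):
        inp_mean[s, i] = 1.0 / counts[s]

    # inp_cls: index of the token at position 0 (the CLS token) of each sequence
    inp_cls = np.zeros(n_tokens, dtype=np.uint32)
    for i, (s, pos) in enumerate(zip(seq_ids, positions)):
        if pos == 0:
            inp_cls[s] = i

    return inp_mean, inp_cls

# two sequences packed into one batch: seq 0 has 3 tokens, seq 1 has 2
inp_mean, inp_cls = build_pooling_inputs(seq_ids=[0, 0, 0, 1, 1],
                                         positions=[0, 1, 2, 0, 1])
```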
@@ -11417,14 +11443,16 @@ struct llama_context * llama_new_context_with_model(
         ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
         ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
         ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-        ctx->inp_sum     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+        ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+        ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

         ggml_set_name(ctx->inp_tokens,  "inp_tokens");
         ggml_set_name(ctx->inp_embd,    "inp_embd");
         ggml_set_name(ctx->inp_pos,     "inp_pos");
         ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
         ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-        ggml_set_name(ctx->inp_sum,     "inp_sum");
+        ggml_set_name(ctx->inp_mean,    "inp_mean");
+        ggml_set_name(ctx->inp_cls,     "inp_cls");

         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));

llama.h

@@ -112,6 +112,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };

+    enum llama_pooling_type {
+        LLAMA_POOLING_NONE = 0,
+        LLAMA_POOLING_MEAN = 1,
+        LLAMA_POOLING_CLS  = 2,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_NONE    = 0, // single GPU
         LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs