mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-27 06:39:25 +01:00)
server : fix default draft model parameters (#10586)

* server : force F16 KV cache for the draft model

ggml-ci

* server : fix draft params

ggml-ci

* server : various params fixes

ggml-ci
This commit is contained in:
parent 642330ac7c
commit 70b98fadbc
@@ -696,8 +696,9 @@ struct server_context {
 
             params_dft.devices      = params_base.speculative.devices;
             params_dft.model        = params_base.speculative.model;
-            params_dft.n_ctx        = params_base.speculative.n_ctx;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+            params_dft.n_parallel   = 1;
 
             common_init_result llama_init_dft = common_init_from_params(params_dft);
 
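The main change in this hunk is the default draft context size: if the user leaves it unset (params_base.speculative.n_ctx == 0), the draft model now gets an equal share of the base context per parallel slot instead of 0. A minimal standalone sketch of that derivation (the helper name draft_n_ctx is hypothetical, not part of the server code):

#include <cstdio>

// Mirrors the ternary in the hunk above: 0 means "unset", so fall back to
// the base context divided by the number of parallel slots.
static int draft_n_ctx(int spec_n_ctx, int base_n_ctx, int n_parallel) {
    return spec_n_ctx == 0 ? base_n_ctx / n_parallel : spec_n_ctx;
}

int main() {
    printf("%d\n", draft_n_ctx(   0, 8192, 4)); // 2048 - derived default
    printf("%d\n", draft_n_ctx(1024, 8192, 4)); // 1024 - explicit user value wins
    return 0;
}

Pinning params_dft.n_parallel to 1 is consistent with the second hunk below, where the shared draft context is freed because each slot creates its own.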
@@ -717,8 +718,14 @@ struct server_context {
                 return false;
             }
 
-            cparams_dft = common_context_params_to_llama(params_base);
-            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+
+            cparams_dft = common_context_params_to_llama(params_dft);
+            cparams_dft.n_batch = n_ctx_dft;
+
+            // force F16 KV cache for the draft model for extra performance
+            cparams_dft.type_k = GGML_TYPE_F16;
+            cparams_dft.type_v = GGML_TYPE_F16;
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
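Two things happen in this hunk: the draft context parameters are now derived from params_dft (the draft model's own parameters) rather than params_base, and the draft KV cache is pinned to F16 regardless of whatever cache type the base model uses. A minimal sketch of the same pattern with stand-in types (cache_type and ctx_params are illustrative, not the real ggml_type / llama_context_params):

#include <cstdio>

// Illustrative stand-ins for ggml_type and llama_context_params.
enum cache_type { CACHE_F16, CACHE_Q8_0 };

struct ctx_params {
    int        n_batch = 0;
    cache_type type_k  = CACHE_Q8_0; // pretend the base model uses a quantized KV cache
    cache_type type_v  = CACHE_Q8_0;
};

int main() {
    const int n_ctx_dft = 2048;        // what llama_n_ctx() reported for the draft context

    ctx_params cparams_dft;            // built from the draft params, not the base ones
    cparams_dft.n_batch = n_ctx_dft;   // batch sized to the full draft context
    cparams_dft.type_k  = CACHE_F16;   // force F16 KV cache for the draft model
    cparams_dft.type_v  = CACHE_F16;

    printf("n_batch=%d f16_kv=%d\n", cparams_dft.n_batch,
           cparams_dft.type_k == CACHE_F16 && cparams_dft.type_v == CACHE_F16);
    return 0;
}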
@@ -2322,6 +2329,10 @@ struct server_context {
                     continue;
                 }
 
+                if (slot.state != SLOT_STATE_GENERATING) {
+                    continue;
+                }
+
                 llama_token id = slot.sampled;
 
                 struct common_speculative_params params_spec;
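The new guard skips any slot that is not actively generating before the speculative-decoding step runs, so idle slots or slots still processing their prompt are never drafted against. A minimal sketch of that pattern (slot_state and server_slot here are simplified stand-ins for the real server types):

#include <cstdio>
#include <vector>

// Simplified stand-ins for the server's slot state machine.
enum slot_state { SLOT_STATE_IDLE, SLOT_STATE_PROCESSING_PROMPT, SLOT_STATE_GENERATING };

struct server_slot {
    int        id;
    slot_state state;
};

int main() {
    std::vector<server_slot> slots = {
        {0, SLOT_STATE_GENERATING},
        {1, SLOT_STATE_PROCESSING_PROMPT},
        {2, SLOT_STATE_IDLE},
    };

    for (const auto & slot : slots) {
        // same guard as the hunk above: only slots that are actively
        // generating take part in speculative decoding
        if (slot.state != SLOT_STATE_GENERATING) {
            continue;
        }
        printf("slot %d: run speculative decoding\n", slot.id);
    }
    return 0;
}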