mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 06:10:29 +01:00
Test-based VRAM scratch size + context adjustment (#2056)
This commit is contained in:
parent
b213227067
commit
befb3a3562
llama.cpp (38 lines changed)
@ -66,6 +66,7 @@ enum e_model {
|
|||||||
MODEL_65B,
|
MODEL_65B,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static const size_t kB = 1024;
|
||||||
static const size_t MB = 1024*1024;
|
static const size_t MB = 1024*1024;
|
||||||
|
|
||||||
// computed for n_ctx == 2048
|
// computed for n_ctx == 2048
|
||||||
@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
|||||||
return k_sizes;
|
return k_sizes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// amount of VRAM needed per batch size to hold temporary results
|
||||||
|
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
|
||||||
|
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
||||||
|
{
|
||||||
|
static std::map<e_model, size_t> k_sizes = {
|
||||||
|
{ MODEL_3B, 512ull * kB },
|
||||||
|
{ MODEL_7B, 512ull * kB },
|
||||||
|
{ MODEL_13B, 640ull * kB },
|
||||||
|
{ MODEL_30B, 768ull * kB },
|
||||||
|
{ MODEL_65B, 1536ull * kB },
|
||||||
|
};
|
||||||
|
return k_sizes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// amount of VRAM needed per batch size and context to hold temporary results
|
||||||
|
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
|
||||||
|
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
||||||
|
{
|
||||||
|
static std::map<e_model, size_t> k_sizes = {
|
||||||
|
{ MODEL_3B, 128ull },
|
||||||
|
{ MODEL_7B, 128ull },
|
||||||
|
{ MODEL_13B, 160ull },
|
||||||
|
{ MODEL_30B, 208ull },
|
||||||
|
{ MODEL_65B, 416ull },
|
||||||
|
};
|
||||||
|
return k_sizes;
|
||||||
|
}
|
||||||
|
|
||||||
// default hparams (LLaMA 7B)
|
// default hparams (LLaMA 7B)
|
||||||
struct llama_hparams {
|
struct llama_hparams {
|
||||||
uint32_t n_vocab = 32000;
|
uint32_t n_vocab = 32000;
|
||||||
@ -1118,11 +1147,14 @@ static void llama_model_load_internal(
|
|||||||
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
|
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
|
||||||
ggml_cuda_set_scratch_size(0); // disable scratch
|
ggml_cuda_set_scratch_size(0); // disable scratch
|
||||||
} else {
|
} else {
|
||||||
vram_scratch = n_batch * MB;
|
const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
|
||||||
|
const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
|
||||||
|
vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
|
||||||
ggml_cuda_set_scratch_size(vram_scratch);
|
ggml_cuda_set_scratch_size(vram_scratch);
|
||||||
if (n_gpu_layers > 0) {
|
if (n_gpu_layers > 0) {
|
||||||
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
|
fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
|
||||||
__func__, vram_scratch / MB);
|
__func__, vram_scratch_base / kB, vram_scratch_per_context,
|
||||||
|
(vram_scratch + MB - 1) / MB); // round up
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
|
Loading…
Reference in New Issue
Block a user