Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 22:08:46 +01:00)
Fixed OpenCL offloading prints (#2082)
commit 9e4475f5cf
parent 7f0e9a775e
llama.cpp (15 lines changed)
@@ -1156,6 +1156,7 @@ static void llama_model_load_internal(
             }
         }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1164,6 +1165,10 @@ static void llama_model_load_internal(
             fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
         size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+        const int max_backend_supported_layers = hparams.n_layer + 3;
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1180,14 +1185,18 @@ static void llama_model_load_internal(
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
-        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
         fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
         fprintf(stderr, "%s: total VRAM used: %zu MB\n",
                 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     }
 
     // populate `tensors_by_name`
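For context, below is a minimal standalone C++ sketch (not part of llama.cpp) of the print logic this commit fixes. The sample values of n_layer, n_gpu_layers, and low_vram are hypothetical; only the max_backend_supported_layers / max_offloadable_layers names and the n_layer + 3 versus n_layer + 1 caps are taken from the diff above. Per the diff, a CUDA build can offload up to n_layer + 3 layers (the repeating layers, the non-repeating layers, and both halves of the KV cache), while a CLBlast build tops out at n_layer + 1; before this commit the "offloaded X/Y" message always used n_layer + 3 as the denominator, so OpenCL builds over-reported what could be offloaded.

// Standalone sketch of the corrected print; sample values are hypothetical.
#include <algorithm>
#include <cstdio>

int main() {
    const int  n_layer      = 32;   // hypothetical model depth
    const int  n_gpu_layers = 100;  // user requests "offload everything"
    const bool low_vram     = false;

#ifdef GGML_USE_CUBLAS
    // CUDA can also take the non-repeating layers and both halves of the KV cache.
    const int max_backend_supported_layers = n_layer + 3;
    const int max_offloadable_layers       = low_vram ? n_layer + 1 : n_layer + 3;
#else
    // CLBlast (modeled by this sketch's default build) stops at the non-repeating layers.
    (void) low_vram;
    const int max_backend_supported_layers = n_layer + 1;
    const int max_offloadable_layers       = n_layer + 1;
#endif

    // Pre-fix, the denominator was hard-coded to n_layer + 3 (35 here), so an
    // OpenCL build printed "offloaded 35/35 layers" even though it can only
    // offload 33; with the backend-specific cap it prints "offloaded 33/33".
    fprintf(stderr, "offloaded %d/%d layers to GPU\n",
            std::min(n_gpu_layers, max_offloadable_layers),
            max_backend_supported_layers);
    return 0;
}

Keeping both constants backend-specific means the numerator and denominator of the log line can never disagree about what the active backend supports.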