Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2025-01-27 20:43:07 +01:00
llama : fix compile warnings
commit 2a4e41a086
parent 17366df842
ggml.c (22)

--- a/ggml.c
+++ b/ggml.c
@@ -14720,12 +14720,12 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
     const int64_t * ne = tensor->ne;
     const size_t * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-12s %8d %8jd %jd %jd %jd %16zu %16zu %16zu %16zu %16p %32s\n",
+    fprintf(fout, "%-6s %-12s %8d %8d %d %d %d %16zu %16zu %16zu %16zu %16p %32s\n",
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
             tensor->n_dims,
-            ne[0], ne[1], ne[2], ne[3],
+            (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->data,
             tensor->name);
 }
@@ -14734,13 +14734,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
     const int64_t * ne = tensor->ne;
     const size_t * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-6s %-12s %8d %jd %jd %jd %jd %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+    fprintf(fout, "%-6s %-6s %-12s %8d %d %d %d %d %16zu %16zu %16zu %16zu %8d %16p %32s\n",
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
             tensor->n_dims,
-            ne[0], ne[1], ne[2], ne[3],
+            (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->n_tasks,
             tensor->data,
             tensor->name);
@@ -14763,11 +14763,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     FILE * fout = stdout;
 
     fprintf(fout, "\n");
     fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
     fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
     fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
     fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
-    fprintf(fout, "%-16s %8ju\n", "eval", size_eval);
+    fprintf(fout, "%-16s %8d\n", "eval", (int) size_eval);
 
     // header
     fprintf(fout, "\n");
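The ggml.c hunks drop the %jd/%ju conversions, which expect intmax_t/uintmax_t arguments and can trigger -Wformat warnings (or be rejected by C runtimes that do not implement the 'j' length modifier), and instead cast the offending int64_t and size_t values to int and print them with %d. A minimal standalone sketch of the same pattern, with example values rather than code from the repository:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Hypothetical tensor shape, standing in for tensor->ne in ggml.c.
    const int64_t ne[4] = { 4096, 32, 1, 1 };

    // Portable across printf implementations: cast each int64_t to int and
    // use the universally supported %d. This assumes the printed values fit
    // in an int, which holds for the debug listing produced here.
    printf("%8d %d %d %d\n", (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3]);

    return 0;
}

An alternative would be the PRId64/PRIu64 macros from <inttypes.h>; the commit keeps it simpler with an int cast, which is fine as long as the printed values fit in an int.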
llama.cpp (15)

--- a/llama.cpp
+++ b/llama.cpp
@@ -1059,23 +1059,23 @@ static void llama_model_load_internal(
         }
     }
 
+    (void) main_gpu;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
     fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
 
     // prepare memory for the weights
     size_t vram_weights = 0;
-    size_t vram_scratch = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1152,10 +1152,8 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-    const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
 #ifdef GGML_USE_CUBLAS
-    vram_scratch = n_batch * MB;
+    const size_t vram_scratch = n_batch * MB;
     ggml_cuda_set_scratch_size(vram_scratch);
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
@@ -1163,6 +1161,8 @@ static void llama_model_load_internal(
     }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+    const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
     fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
         fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
@@ -1331,6 +1331,7 @@ static bool llama_eval_internal(
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
     const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
 
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;
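The llama.cpp hunks address unused-variable warnings in CPU-only builds: main_gpu and i_gpu_start are only referenced when GGML_USE_CUBLAS (or GGML_USE_CLBLAST) is defined, so explicit (void) casts mark them as intentionally unused otherwise, and n_gpu / vram_scratch are now declared inside the preprocessor branches that actually use them. A minimal standalone sketch of both idioms, using hypothetical names (USE_GPU, load_model) rather than the real build flags:

#include <stdio.h>

// Hypothetical build flag standing in for GGML_USE_CUBLAS; define it to
// compile the "GPU" branch.
/* #define USE_GPU */

static void load_model(int gpu_layers, int main_gpu) {
    // The parameter is only used in the GPU build; the (void) cast tells the
    // compiler that leaving it unused in CPU-only builds is intentional, so
    // -Wunused-parameter stays quiet.
    (void) main_gpu;

#ifdef USE_GPU
    // Declare GPU-only locals inside the branch that uses them, instead of at
    // function scope where they would be unused in CPU-only builds.
    const int n_gpu = gpu_layers;
    printf("offloading %d layers to GPU %d\n", n_gpu, main_gpu);
#else
    printf("running on CPU, %d layers requested for offload\n", gpu_layers);
#endif
}

int main(void) {
    load_model(8, 0);
    return 0;
}

Compiling this sketch with -Wall -Wextra, with or without -DUSE_GPU, should produce no unused-parameter or unused-variable warnings in either configuration.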