mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-24 13:28:50 +01:00
metal : reusing llama.cpp logging (#3152)
* metal : reusing llama.cpp logging * cmake : build fix * metal : logging callback * metal : logging va_args memory fix * metal : minor cleanup * metal : setting function like logging macro to capital letters * llama.cpp : trailing whitespace fix * ggml : log level enum used by llama * Makefile : cleanup ggml-metal recipe * ggml : ggml_log_callback typedef * ggml : minor --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
527e57cfd8
commit
dc6897404e
@ -903,7 +903,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
|
static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
|
||||||
(void) level;
|
(void) level;
|
||||||
(void) text;
|
(void) text;
|
||||||
(void) user_data;
|
(void) user_data;
|
||||||
|
@ -19,6 +19,8 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
@ -33,6 +35,8 @@ struct ggml_cgraph;
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||||
|
|
||||||
struct ggml_metal_context;
|
struct ggml_metal_context;
|
||||||
|
|
||||||
// number of command buffers to use
|
// number of command buffers to use
|
||||||
|
116
ggml-metal.m
116
ggml-metal.m
@ -11,11 +11,14 @@
|
|||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
// TODO: temporary - reuse llama.cpp logging
|
|
||||||
#ifdef GGML_METAL_NDEBUG
|
#ifdef GGML_METAL_NDEBUG
|
||||||
#define metal_printf(...)
|
#define GGML_METAL_LOG_INFO(...)
|
||||||
|
#define GGML_METAL_LOG_WARN(...)
|
||||||
|
#define GGML_METAL_LOG_ERROR(...)
|
||||||
#else
|
#else
|
||||||
#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
|
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||||
|
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||||
|
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
#define UNUSED(x) (void)(x)
|
||||||
@ -120,8 +123,37 @@ static NSString * const msl_library_source = @"see metal.metal";
|
|||||||
@implementation GGMLMetalClass
|
@implementation GGMLMetalClass
|
||||||
@end
|
@end
|
||||||
|
|
||||||
|
ggml_log_callback ggml_metal_log_callback = NULL;
|
||||||
|
void * ggml_metal_log_user_data = NULL;
|
||||||
|
|
||||||
|
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||||
|
ggml_metal_log_callback = log_callback;
|
||||||
|
ggml_metal_log_user_data = user_data;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
|
||||||
|
if (ggml_metal_log_callback != NULL) {
|
||||||
|
va_list args;
|
||||||
|
va_start(args, format);
|
||||||
|
char buffer[128];
|
||||||
|
int len = vsnprintf(buffer, 128, format, args);
|
||||||
|
if (len < 128) {
|
||||||
|
ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
|
||||||
|
} else {
|
||||||
|
char* buffer2 = malloc(len+1);
|
||||||
|
vsnprintf(buffer2, len+1, format, args);
|
||||||
|
buffer2[len] = 0;
|
||||||
|
ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
|
||||||
|
free(buffer2);
|
||||||
|
}
|
||||||
|
va_end(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
metal_printf("%s: allocating\n", __func__);
|
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
|
||||||
|
|
||||||
id <MTLDevice> device;
|
id <MTLDevice> device;
|
||||||
NSString * s;
|
NSString * s;
|
||||||
@ -131,14 +163,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
NSArray * devices = MTLCopyAllDevices();
|
NSArray * devices = MTLCopyAllDevices();
|
||||||
for (device in devices) {
|
for (device in devices) {
|
||||||
s = [device name];
|
s = [device name];
|
||||||
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Pick and show default Metal device
|
// Pick and show default Metal device
|
||||||
device = MTLCreateSystemDefaultDevice();
|
device = MTLCreateSystemDefaultDevice();
|
||||||
s = [device name];
|
s = [device name];
|
||||||
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||||
|
|
||||||
// Configure context
|
// Configure context
|
||||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||||
@ -165,7 +197,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -179,11 +211,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
|
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
|
||||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||||
metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
|
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||||
|
|
||||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||||
if (error) {
|
if (error) {
|
||||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,7 +227,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
||||||
#endif
|
#endif
|
||||||
if (error) {
|
if (error) {
|
||||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -207,11 +239,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
#define GGML_METAL_ADD_KERNEL(name) \
|
#define GGML_METAL_ADD_KERNEL(name) \
|
||||||
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
|
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
|
||||||
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
|
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
|
||||||
metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||||
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
|
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
|
||||||
(int) ctx->pipeline_##name.threadExecutionWidth); \
|
(int) ctx->pipeline_##name.threadExecutionWidth); \
|
||||||
if (error) { \
|
if (error) { \
|
||||||
metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||||
return NULL; \
|
return NULL; \
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -270,13 +302,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
#undef GGML_METAL_ADD_KERNEL
|
#undef GGML_METAL_ADD_KERNEL
|
||||||
}
|
}
|
||||||
|
|
||||||
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||||
#if TARGET_OS_OSX
|
#if TARGET_OS_OSX
|
||||||
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
if (ctx->device.maxTransferRate != 0) {
|
if (ctx->device.maxTransferRate != 0) {
|
||||||
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||||
} else {
|
} else {
|
||||||
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
|
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -284,7 +316,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
metal_printf("%s: deallocating\n", __func__);
|
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
||||||
#define GGML_METAL_DEL_KERNEL(name) \
|
#define GGML_METAL_DEL_KERNEL(name) \
|
||||||
[ctx->function_##name release]; \
|
[ctx->function_##name release]; \
|
||||||
[ctx->pipeline_##name release];
|
[ctx->pipeline_##name release];
|
||||||
@ -360,7 +392,7 @@ void * ggml_metal_host_malloc(size_t n) {
|
|||||||
void * data = NULL;
|
void * data = NULL;
|
||||||
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
metal_printf("%s: error: posix_memalign failed\n", __func__);
|
GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -388,7 +420,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
|||||||
// Metal buffer based on the host memory pointer
|
// Metal buffer based on the host memory pointer
|
||||||
//
|
//
|
||||||
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
||||||
//metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
||||||
|
|
||||||
const int64_t tsize = ggml_nbytes(t);
|
const int64_t tsize = ggml_nbytes(t);
|
||||||
|
|
||||||
@ -400,13 +432,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
|||||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
||||||
*offs = (size_t) ioffs;
|
*offs = (size_t) ioffs;
|
||||||
|
|
||||||
//metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
//GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
||||||
|
|
||||||
return ctx->buffers[i].metal;
|
return ctx->buffers[i].metal;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
metal_printf("%s: error: buffer is nil\n", __func__);
|
GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
|
||||||
|
|
||||||
return nil;
|
return nil;
|
||||||
}
|
}
|
||||||
@ -418,7 +450,7 @@ bool ggml_metal_add_buffer(
|
|||||||
size_t size,
|
size_t size,
|
||||||
size_t max_size) {
|
size_t max_size) {
|
||||||
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
||||||
metal_printf("%s: too many buffers\n", __func__);
|
GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -428,7 +460,7 @@ bool ggml_metal_add_buffer(
|
|||||||
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
|
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
|
||||||
|
|
||||||
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
|
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
|
||||||
metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -449,11 +481,11 @@ bool ggml_metal_add_buffer(
|
|||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||||
|
|
||||||
++ctx->n_buffers;
|
++ctx->n_buffers;
|
||||||
} else {
|
} else {
|
||||||
@ -473,13 +505,13 @@ bool ggml_metal_add_buffer(
|
|||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||||
if (i + size_step < size) {
|
if (i + size_step < size) {
|
||||||
metal_printf("\n");
|
GGML_METAL_LOG_INFO("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
++ctx->n_buffers;
|
++ctx->n_buffers;
|
||||||
@ -487,17 +519,17 @@ bool ggml_metal_add_buffer(
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if TARGET_OS_OSX
|
#if TARGET_OS_OSX
|
||||||
metal_printf(", (%8.2f / %8.2f)",
|
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||||
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||||
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
|
|
||||||
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
||||||
metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
|
GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
} else {
|
} else {
|
||||||
metal_printf("\n");
|
GGML_METAL_LOG_INFO("\n");
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -610,7 +642,7 @@ void ggml_metal_graph_find_concurrency(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
|
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
|
||||||
metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
|
GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -664,7 +696,7 @@ void ggml_metal_graph_compute(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
//GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||||
|
|
||||||
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
||||||
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
|
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
|
||||||
@ -708,17 +740,17 @@ void ggml_metal_graph_compute(
|
|||||||
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
|
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
|
||||||
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
|
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
|
||||||
|
|
||||||
//metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
||||||
//if (src0) {
|
//if (src0) {
|
||||||
// metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
|
// GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
|
||||||
// ggml_is_contiguous(src0), src0->name);
|
// ggml_is_contiguous(src0), src0->name);
|
||||||
//}
|
//}
|
||||||
//if (src1) {
|
//if (src1) {
|
||||||
// metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
|
// GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
|
||||||
// ggml_is_contiguous(src1), src1->name);
|
// ggml_is_contiguous(src1), src1->name);
|
||||||
//}
|
//}
|
||||||
//if (dst) {
|
//if (dst) {
|
||||||
// metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
|
// GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
|
||||||
// dst->name);
|
// dst->name);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
@ -830,7 +862,7 @@ void ggml_metal_graph_compute(
|
|||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@ -1019,7 +1051,7 @@ void ggml_metal_graph_compute(
|
|||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
metal_printf("Asserting on type %d\n",(int)src0t);
|
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||||
GGML_ASSERT(false && "not implemented");
|
GGML_ASSERT(false && "not implemented");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
|
|||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1286,7 +1318,7 @@ void ggml_metal_graph_compute(
|
|||||||
|
|
||||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
||||||
if (status != MTLCommandBufferStatusCompleted) {
|
if (status != MTLCommandBufferStatusCompleted) {
|
||||||
metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
7
ggml.h
7
ggml.h
@ -445,6 +445,12 @@ extern "C" {
|
|||||||
GGML_OBJECT_WORK_BUFFER
|
GGML_OBJECT_WORK_BUFFER
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum ggml_log_level {
|
||||||
|
GGML_LOG_LEVEL_ERROR = 2,
|
||||||
|
GGML_LOG_LEVEL_WARN = 3,
|
||||||
|
GGML_LOG_LEVEL_INFO = 4
|
||||||
|
};
|
||||||
|
|
||||||
// ggml object
|
// ggml object
|
||||||
struct ggml_object {
|
struct ggml_object {
|
||||||
size_t offs;
|
size_t offs;
|
||||||
@ -1691,6 +1697,7 @@ extern "C" {
|
|||||||
};
|
};
|
||||||
|
|
||||||
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
||||||
|
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
// optimization parameters
|
// optimization parameters
|
||||||
//
|
//
|
||||||
|
21
llama.cpp
21
llama.cpp
@ -92,12 +92,12 @@
|
|||||||
//
|
//
|
||||||
|
|
||||||
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
||||||
static void llama_log_internal (llama_log_level level, const char* format, ...);
|
static void llama_log_internal (ggml_log_level level, const char* format, ...);
|
||||||
static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
|
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
|
||||||
//
|
//
|
||||||
// helpers
|
// helpers
|
||||||
@ -904,7 +904,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
|
|||||||
|
|
||||||
struct llama_state {
|
struct llama_state {
|
||||||
// We save the log callback globally
|
// We save the log callback globally
|
||||||
llama_log_callback log_callback = llama_log_callback_default;
|
ggml_log_callback log_callback = llama_log_callback_default;
|
||||||
void * log_callback_user_data = nullptr;
|
void * log_callback_user_data = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -6366,6 +6366,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
ggml_metal_log_set_callback(llama_log_callback_default, NULL);
|
||||||
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
||||||
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
||||||
}
|
}
|
||||||
@ -7199,12 +7200,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
|
|||||||
return ctx->model.tensors_by_name;
|
return ctx->model.tensors_by_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_log_set(llama_log_callback log_callback, void * user_data) {
|
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||||
g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
||||||
g_state.log_callback_user_data = user_data;
|
g_state.log_callback_user_data = user_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
|
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
|
||||||
va_list args_copy;
|
va_list args_copy;
|
||||||
va_copy(args_copy, args);
|
va_copy(args_copy, args);
|
||||||
char buffer[128];
|
char buffer[128];
|
||||||
@ -7221,14 +7222,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
|
|||||||
va_end(args_copy);
|
va_end(args_copy);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_log_internal(llama_log_level level, const char * format, ...) {
|
static void llama_log_internal(ggml_log_level level, const char * format, ...) {
|
||||||
va_list args;
|
va_list args;
|
||||||
va_start(args, format);
|
va_start(args, format);
|
||||||
llama_log_internal_v(level, format, args);
|
llama_log_internal_v(level, format, args);
|
||||||
va_end(args);
|
va_end(args);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
|
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
|
||||||
(void) level;
|
(void) level;
|
||||||
(void) user_data;
|
(void) user_data;
|
||||||
fputs(text, stderr);
|
fputs(text, stderr);
|
||||||
|
15
llama.h
15
llama.h
@ -62,12 +62,6 @@ extern "C" {
|
|||||||
|
|
||||||
typedef int llama_token;
|
typedef int llama_token;
|
||||||
|
|
||||||
enum llama_log_level {
|
|
||||||
LLAMA_LOG_LEVEL_ERROR = 2,
|
|
||||||
LLAMA_LOG_LEVEL_WARN = 3,
|
|
||||||
LLAMA_LOG_LEVEL_INFO = 4
|
|
||||||
};
|
|
||||||
|
|
||||||
enum llama_vocab_type {
|
enum llama_vocab_type {
|
||||||
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
||||||
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
||||||
@ -151,13 +145,6 @@ extern "C" {
|
|||||||
bool embedding; // embedding mode only
|
bool embedding; // embedding mode only
|
||||||
};
|
};
|
||||||
|
|
||||||
// Signature for logging events
|
|
||||||
// Note that text includes the new line character at the end for most events.
|
|
||||||
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
|
|
||||||
// if it exists.
|
|
||||||
// It might not exist for progress report where '.' is output repeatedly.
|
|
||||||
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
|
|
||||||
|
|
||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
typedef struct llama_model_quantize_params {
|
typedef struct llama_model_quantize_params {
|
||||||
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||||
@ -526,7 +513,7 @@ extern "C" {
|
|||||||
|
|
||||||
// Set callback for all future logging events.
|
// Set callback for all future logging events.
|
||||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
||||||
|
|
||||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user