From 6b83a3e16fa3126d2c5e6667d2f396bde84a68b4 Mon Sep 17 00:00:00 2001 From: mqy Date: Fri, 16 Jun 2023 20:32:12 +0800 Subject: [PATCH] try make CL run w/o tunning, but -ngl stucks no output. had to add task runer and profile id, many changes, see the f codes --- examples/mulmat-tune/mulmat-tune.cpp | 27 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 37 ++- ggml-threading.h | 22 +- ggml-tune.c | 70 ++++- ggml-tune.h | 13 +- ggml.c | 445 ++++++++++++++------------- ggml.h | 29 ++ tests/test-ggml-threading.c | 61 +++- tests/test-ggml-tune.c | 10 +- 10 files changed, 435 insertions(+), 281 deletions(-) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index 55dd19275..da1d0a1c1 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -11,6 +11,10 @@ #define UNUSED(x) (void)(x) +// F16 has an pending Illegal Instruction error on macos-latest-cmake. +// So the workaround is to disable non-quantized ftypes. +// #define SUPPORT_NONE_Q_TYPE 1 + static void print_build_tips(void) { const char *a = "LLAMA_NO_ACCELERATE"; fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); @@ -62,11 +66,12 @@ static void usage(char *prog) { "--model MODEL 3B | 7B | 13B | 30B | 65B", " default 7B", "--ftype FTYPE ggml ftype:", +#ifdef SUPPORT_NONE_Q_TYPE " 0: all F32", " 1: mostly F16", +#endif " 2: mostly Q4_0", " 3: mostly Q4_1", - " 4: mostly Q4_1, some F16", " 7: mostly Q8_0", " 8: mostly Q5_0", " 9: mostly Q5_1", @@ -84,7 +89,7 @@ static void usage(char *prog) { " requires: between [1, 3]", "--n_threads NTH bench with this number of threads", " requires: between [1, 16]", - " default 1", + " default 4", "--file FILE data file to write", " default stdout", "-y always answer \"yes\" to all prompts", @@ -170,8 +175,22 @@ int main(int argc, char **argv) { ftype = (enum ggml_ftype)v; } +#ifndef SUPPORT_NONE_Q_TYPE if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) { - fprintf(stderr, "none quantized type %d is not supported\n", ftype); + fprintf(stderr, "error: none quantized type %d is not supported\n", + ftype); + return 1; + } +#endif + + bool cond_1 = ftype >= GGML_FTYPE_MOSTLY_Q4_0 && + ftype <= GGML_FTYPE_MOSTLY_Q4_1; + bool cond_2 = + ftype >= GGML_FTYPE_MOSTLY_Q8_0 && ftype <= GGML_FTYPE_MOSTLY_Q6_K; + + if (!(cond_1 || cond_2)) { + fprintf(stderr, "error: type %d is not a known ggml ftype.\n", + ftype); return 1; } } @@ -223,7 +242,7 @@ int main(int argc, char **argv) { } } - int n_threads = 1; + int n_threads = 4; { if (arg_n_threads != NULL) { int v = atoi(arg_n_threads); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index c9151a8e4..2a1a04fca 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c index 6dd6d2817..7ef763c0f 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -170,7 +170,8 @@ struct ggml_compute_state_shared { atomic_bool wait_on_done; atomic_bool stop; - ggml_threading_task_runner *task_runner; + // Default task runner, can be overriden by node.task_profile.runner. 
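+    // When a node sets task_profile.runner, both the main thread and the
+    // workers use that runner for the node instead of this default.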
+ ggml_task_runner *task_runner; struct ggml_threading_context *ctx; }; @@ -391,8 +392,10 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { } if (shared->n_tasks > 0 && state->has_work) { - enum ggml_compute_error err = - shared->task_runner(&state->params, state->node); + ggml_task_runner *runner = state->node->task_profile.runner + ? state->node->task_profile.runner + : shared->task_runner; + enum ggml_compute_error err = runner(&state->params, state->node); GGML_ASSERT(err == GGML_COMPUTE_OK); @@ -427,8 +430,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + + ggml_task_runner *runner = ctx->shared.task_runner; + if (node->task_profile.runner) { + runner = node->task_profile.runner; + } + struct ggml_compute_state_shared *state_shared = &ctx->shared; // This is the params for main thread. @@ -491,7 +499,7 @@ START: params.wsize = wsize; params.wdata = wdata; - err = state_shared->task_runner(¶ms, node); + err = runner(¶ms, node); } // wait for tasks done. @@ -509,11 +517,21 @@ START: if (err != GGML_COMPUTE_OK) { if (err == GGML_COMPUTE_FALLBACK) { + PRINT_DEBUG("[main] fallback from profile, id=%d\n", + node->task_profile.id); + GGML_ASSERT(node->task_profile.stages[1].backend > + GGML_TASK_BACKEND_CPU); + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); + GGML_ASSERT(profiles[0].stages[1].backend == + GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, &profiles[0], sizeof(struct ggml_task_profile)); + runner = ctx->shared.task_runner; + goto START; } return err; @@ -525,12 +543,13 @@ START: struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]) { GGML_ASSERT(n_threads > 0); - GGML_ASSERT(thread_runner); - GGML_ASSERT(task_stage_runner); + if (thread_runner == NULL) { + thread_runner = ggml_threading_graph_compute_thread; + } size_t ctx_sz = sizeof(struct ggml_threading_context); struct ggml_threading_context *ctx = malloc(ctx_sz); @@ -545,7 +564,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, .wait_now = false, .wait_on_done = false, .stop = false, - .task_runner = task_stage_runner, + .task_runner = task_runner, .ctx = ctx, }; diff --git a/ggml-threading.h b/ggml-threading.h index f3214efc7..189fc2ed5 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -21,27 +21,21 @@ enum ggml_threading_features { GGML_THREADING_FEATURE_PERF = 1 << 1, }; -// Compute errors. -enum ggml_compute_error { - GGML_COMPUTE_OK = 0, - GGML_COMPUTE_FALLBACK = 1, -}; - -// The task runner to be called by main thread and workers. -typedef enum ggml_compute_error(ggml_threading_task_runner)( - struct ggml_compute_params *params, struct ggml_tensor *node); - // The thread runner to feed into OS threads. typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // Init and start underlying workers if n_threads > 1. // -// features: optional for configure threading additional features. -// see `ggml_threading_feature`, default 0. +// thread: optional OS thread runner, default value: +// `ggml_threading_graph_compute_thread`. +// +// features: optional for configure +// threading additional features. see `ggml_threading_feature`, default 0. 
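+//
+// task_runner: default task runner, used for nodes that do not provide their
+// own task_profile.runner.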
+// // stages_time: optional for collecting per-stage wall clock time. struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]); @@ -60,7 +54,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, // This is an experimental functionality for mulmat tune, as a thin wrapper. enum ggml_compute_error -ggml_compute_forward_wrapper(struct ggml_compute_params *params, +ggml_compute_forward_wrapper(const struct ggml_compute_params *params, struct ggml_tensor *tensor); #ifdef __cplusplus diff --git a/ggml-tune.c b/ggml-tune.c index 20f395069..aeb63e957 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -44,9 +44,12 @@ ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { } } -const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( - struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]) { +// NOTE: we can not use the profile from tune because the profiles do not +// contain fields such as runner, get_size. +int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]) { GGML_ASSERT(tune); // TODO: default_mm_cache is thread-unsafe. @@ -103,15 +106,15 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( names[i] = ggml_mulmat_tune_task_backend_name( prof->stages[i].backend); } - printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the " - "fastest profile: %s %s %s\n", - M, N, K, names[0], names[1], names[2]); + printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " + "backends: %s %s %s\n", + M, N, K, prof->id, names[0], names[1], names[2]); #endif } } } - return prof; + return prof->id; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -264,10 +267,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { if (shape->m_num > 0) { if (shape->arr_m) { free(shape->arr_m); + shape->arr_m = NULL; } if (shape->items) { free(shape->items); + shape->items = NULL; } + shape->m_num = 0; } } } @@ -277,6 +283,11 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; + rc = fprintf(fp, "%d ", profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, @@ -304,7 +315,6 @@ static bool ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, const char *model, int ftype, int n_threads, char *errbuf, int errbuf_len) { - if (tune->version != GGML_MULMAT_TUNE_VERSION) { snprintf(errbuf, errbuf_len - 1, "version mismatch, built-in: %d, " @@ -348,14 +358,28 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)"); + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (n_profiles)"); return false; } // TODO: profiles order is relevant, too strict. 
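+    // (The loop below compares built-in and loaded profiles index by index,
+    // so reordering the built-in profiles would invalidate an otherwise
+    // compatible tune file.)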
- size_t sz = sizeof(struct ggml_task_profile) * n_profiles; - if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)"); + // Only validate stages! + size_t sz = sizeof(struct ggml_task_stage) * 3; + bool matched = true; + for (int j = 0; j < n_profiles; j++) { + if (builtin_profiles[j].id != shape->profiles[j].id) { + return false; + } + if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages, + sz) != 0) { + matched = false; + break; + } + } + if (!matched) { + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (profiles)"); printf("=== built-in profiles:\n"); ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, @@ -426,6 +450,12 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; + + rc = fscanf(fp, "%d ", &profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; int backend; @@ -777,6 +807,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, GGML_ASSERT(params); GGML_ASSERT(params->model.name); + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + enum ggml_task_backend backends[16]; int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { @@ -785,6 +817,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, return false; } + if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && + params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { +#if defined(GGML_USE_CLBLAST) + printf("[tune] error: cl implementation does not support k_quants at " + "the time of writing this code, skip.\n"); + return false; +#endif + } + bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); if (!ok) { return false; @@ -816,9 +857,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, int64_t t0 = ggml_time_ms(); struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, ggml_threading_graph_compute_thread, - ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, - stages_time); + tune->n_threads, NULL, ggml_compute_forward_wrapper, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; diff --git a/ggml-tune.h b/ggml-tune.h index b12466155..7955a50a9 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_TUNE_VERSION 9 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -55,7 +55,7 @@ struct ggml_mulmat_tune_shape { struct ggml_mulmat_tune_m *items; }; - struct ggml_mulmat_tune_cache_ele { +struct ggml_mulmat_tune_cache_ele { int M; int N; int K; @@ -98,10 +98,11 @@ struct ggml_mulmat_tune_params { }; // NOTE: stages_time is filled if not null. -const struct ggml_task_profile * -ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, - int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]); +// Return profile id. 
+int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]); bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, const char *model_name, int ftype, diff --git a/ggml.c b/ggml.c index b734f1a0c..43ec93a64 100644 --- a/ggml.c +++ b/ggml.c @@ -8500,19 +8500,14 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); } -#else - GGML_ASSERT(false); + return; + } #endif - }; const int64_t nr = ggml_nrows(src0); @@ -9938,7 +9933,7 @@ static void ggml_compute_forward_rms_norm_back( } } - +// CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10010,18 +10005,6 @@ static void ggml_compute_forward_mul_mat_f32( // compute by src0 rows enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10104,6 +10087,7 @@ static void ggml_compute_forward_mul_mat_f32( //} } +// CPU only. 
static void ggml_compute_forward_mul_mat_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10168,19 +10152,9 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10304,6 +10278,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } +// CPU only static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10373,20 +10348,8 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -14294,30 +14257,9 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); - enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { -#if defined(GGML_USE_CUBLAS) - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return GGML_COMPUTE_OK; - } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); - return GGML_COMPUTE_FALLBACK; -#else - GGML_ASSERT(false); -#endif - } - - // if (tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { - // printf("mulmat: test fallback\n"); - // return GGML_COMPUTE_FALLBACK; - // } - switch (tensor->op) { case GGML_OP_DUP: { @@ -14568,13 +14510,6 @@ static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * return GGML_COMPUTE_OK; } -enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. 
- return ggml_compute_forward(params, tensor); -} - //////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { @@ -15524,12 +15459,67 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- +// TODO: replace with ggml_compute_forward_cuda +// DO NOT check matrix size further. +#if defined(GGML_USE_CUBLAS) +static enum ggml_compute_error ggml_compute_forward_cuda( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); + if (ggml_cuda_compute_forward(params, tensor)) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +} +#endif + +// TODO: replace with ggml_cl_mul_mat. +// DO NOT check matrix size further. +#if defined(GGML_USE_CLBLAST) +static enum ggml_compute_error ggml_compute_forward_cl( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); + return GGML_COMPUTE_OK; + default: + break; + } + + GGML_ASSERT(false); +} + +static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; +} +#endif + +// The wrapper for external mulmat tune tool. +enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); +} + // Implement `ggml_task_profiles_provider`. // Fill `profiles` for the `node` and return number of profiles. // // NOTE: the node may be incompleted from testing or tunning, so please assert // everything used here. -inline int ggml_get_task_profiles( +// +// TODO: configure cuda for none mul_mat nodes. +int ggml_get_task_profiles( struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { GGML_ASSERT(node); @@ -15595,6 +15585,8 @@ inline int ggml_get_task_profiles( } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { + // CPU only profiles. + // CUDA/CL: see end of function. 
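+        // (The GPU profiles appended there carry their own runner and, for
+        // CL, a get_wsize; profile ids are assigned sequentially from 1 once
+        // all profiles have been collected.)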
GGML_ASSERT(node->src0); GGML_ASSERT(node->src1); @@ -15614,16 +15606,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (src0_t == GGML_TYPE_F16) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15635,16 +15617,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (ggml_is_quantized(src0_t)) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15658,16 +15630,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } n_profiles = i; } break; @@ -15757,7 +15719,43 @@ inline int ggml_get_task_profiles( GGML_ASSERT(false); } +#if defined(GGML_USE_CUBLAS) + switch (node->op) { + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_SILU: + case GGML_OP_RMS_NORM: + case GGML_OP_MUL_MAT: + case GGML_OP_RESHAPE: + case GGML_OP_ROPE: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#elif defined(GGML_USE_CLBLAST) + switch (node->op) { + case GGML_OP_MUL_MAT: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cl; + p[i].get_wsize = ggml_compute_forward_get_wsize_cl; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#endif + GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); + for (int i = 0; i < n_profiles; i++) { + profiles[i].id = i + 1; + } return n_profiles; } @@ -15769,7 +15767,7 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( GGML_ASSERT(node); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(n_profiles > 0); enum ggml_type src0_t = node->src0->type; enum ggml_type src1_t = node->src1->type; @@ -15777,9 +15775,9 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( // Type and memory layout requirements for computing mul_mat with BLAS. 
bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); int M = (int)node->ne[1]; int N = (int)node->ne[0]; @@ -15790,10 +15788,14 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( if (cond_match) { #if defined(GGML_USE_TUNE) if (tune != NULL) { - prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + GGML_ASSERT(n_profiles >= 2); + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); - if (prof != NULL) { - return prof; + for (int i = 0; i < n_profiles; i++) { + if (profiles[i].id == id) { + prof = &profiles[i]; + return prof; + } } } #else @@ -15841,11 +15843,101 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( return prof; } +void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, + struct ggml_cgraph *cgraph) { + // Pre-specified. + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].backend > 0) { + return; + } + } + + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(node, profiles); + + const struct ggml_task_profile *profile = NULL; + + // GPU offloading. A special case of pre-specified task_profile. + if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { + if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { + enum ggml_task_backend be; + if (ggml_cpu_has_cublas()) { + be = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + be = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + + for (int j = 0; j < n_profiles; j++) { + if (profiles[j].stages[1].backend == be) { + profile = &profiles[j]; + break; + } + } + GGML_ASSERT(profile); + GGML_ASSERT(!cgraph->tune); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + } + + // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. + if (node->op == GGML_OP_MUL_MAT) { +#if defined(GGML_USE_TUNE) + GGML_ASSERT(node->backend == GGML_BACKEND_CPU); + + int stages_time_us[3]; + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, + cgraph->tune, stages_time_us); + GGML_ASSERT(profile); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + + if (cgraph->tune) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (node->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + node->task_profile.stages[j].wait = false; + } + } + } + } + return; +#else + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; +#endif + } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + + // default. 
+ profile = &profiles[0]; + GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); +} + void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + n_threads, NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); // initialize tasks + work buffer @@ -15854,107 +15946,34 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t work_size = 0; - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) { - continue; - } - int n_profiles = ggml_get_task_profiles(node, profiles); - - const struct ggml_task_profile *profile = NULL; - - // Adapt node->backend: assume GPU at COMPUTE stage. - if (node->backend == GGML_BACKEND_GPU || - node->backend == GGML_BACKEND_GPU_SPLIT) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); - } - - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } - } - GGML_ASSERT(profile); - } else { - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - } - - bool profile_copied = false; - - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile( - node, profiles, n_profiles, cgraph->tune, stages_time_us); - GGML_ASSERT(profile); - - if (cgraph->tune) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); - profile_copied = true; - - // Do not wait if the estimated execution time is too small - // (e.g. less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j< 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; - } - } - } - } -#else - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); -#endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right? - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); - } else { - profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == - GGML_TASK_BACKEND_CPU); - } - - if (!profile_copied) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); - } + GGML_ASSERT (node->op != GGML_OP_NONE); struct ggml_task_stage *stages = node->task_profile.stages; - // Workrounnd to set node->backend. - for (int j = 0; j < 3; j++) { - if (node->backend == GGML_BACKEND_CPU && - (stages[j].backend & GGML_TASK_BACKEND_GPU)) { - if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) { - node->backend = GGML_BACKEND_GPU; - } else { - GGML_ASSERT(false); - } + ggml_graph_compute_set_tensor_task_proile(node, cgraph); + + // + // Allocate temp buffer `wdata` for CPU. + // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. 
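+        // When the selected profile provides get_wsize, a non-negative result
+        // is used directly and the per-op sizing below is skipped; a negative
+        // result means the CPU sizing logic still applies.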
+ // + + if (node->task_profile.get_wsize) { + int sz = node->task_profile.get_wsize(node); + if (sz >= 0) { + work_size = MAX(work_size, (size_t)sz); + continue; } } + //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); + // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; - // Allocate temp buffer `wdata` for CPU. - // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: @@ -16012,20 +16031,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: // FIXME: is is right? + case GGML_OP_OUT_PROD: // FIXME: is this correct? { size_t cur = 0; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node)); - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); -#else - GGML_ASSERT(false); -#endif - } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { GGML_ASSERT(ggml_cpu_has_cpublas()); GGML_ASSERT(node->src1->type == GGML_TYPE_F32); @@ -16039,11 +16050,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - GGML_ASSERT(ggml_cpu_has_cublas()); - } - + } else { // CPU or GPU fallback GGML_ASSERT(node->src1->type == GGML_TYPE_F32); if (node->src0->type == GGML_TYPE_F32) { @@ -16056,8 +16063,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else { - GGML_ASSERT(false); } work_size = MAX(work_size, cur); diff --git a/ggml.h b/ggml.h index 5ab78c4a0..d4d5d3521 100644 --- a/ggml.h +++ b/ggml.h @@ -390,11 +390,40 @@ extern "C" { bool wait; }; + struct ggml_tensor; + struct ggml_compute_params; + + // Compute errors. + enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, + }; + + // The task runner to be called by main thread and workers. + typedef enum ggml_compute_error(ggml_task_runner)( + const struct ggml_compute_params *params, + struct ggml_tensor *node); + + // Get wsize for node computing. + // When return -1: should be explained as `fallback to CPU`, caller MUST + // determine how much memory to reserve for this node. + typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); + // config for computing a tensor. struct ggml_task_profile { + // profile id, start from 1. + int id; + // index 0: INIT, 1: COMPUTE, 2: FINALIZE struct ggml_task_stage stages[3]; + // Optional task runner, overrides threading's task runner. + ggml_task_runner *runner; + + // Optional function to return required wsize for wdata. + ggml_task_get_wsize *get_wsize; + + // Optional flag for development. // MUST be used only in testing codes. 
uint8_t dev_flags[4]; }; diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 90d53e4cd..2079fe144 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -42,7 +42,8 @@ static const int n_repeat = 10; static int work_done_arr[MAX_N_THREADS]; static enum ggml_compute_error -mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { +mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -79,9 +80,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t0 = (int)ggml_time_us(); - struct ggml_threading_context *ctx = - ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, features, /*stages_time*/ NULL); + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -141,7 +141,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { } static enum ggml_compute_error -mock_task_runner_fallback(struct ggml_compute_params *params, +mock_task_runner_fallback(const struct ggml_compute_params *params, struct ggml_tensor *node) { UNUSED(params); if (node->backend == GGML_BACKEND_GPU) { @@ -158,7 +158,7 @@ mock_task_runner_fallback(struct ggml_compute_params *params, // thus it is not parallelled. int test_fallback(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( - 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + 1, NULL, mock_task_runner_fallback, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); enum ggml_compute_error err = @@ -177,6 +177,38 @@ int test_fallback(struct ggml_tensor *node) { return 0; } +static enum ggml_compute_error +customized_node_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + // Reset runner thus caller will know it was called. + node->task_profile.runner = NULL; + return GGML_COMPUTE_OK; +} + +// Test when node->task_profile.runner is not NULL. +int test_customized_node_runner(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, NULL, mock_task_runner, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + node->task_profile.runner = customized_node_runner; + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + // should not happen. + abort(); + } + + if (node->task_profile.runner != NULL) { + return 2; + } + + return 0; +} + int main(void) { ggml_time_init(); @@ -367,7 +399,10 @@ int main(void) { } } + // fallback { + printf("[test-ggml-threading] test fallback ...\n"); + ++n_tests; // required by getting task profiles. 
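For reference, a minimal usage sketch of the threading API after this patch (an editor's illustration, not part of the diff): passing NULL as the thread runner selects ggml_threading_graph_compute_thread, and a runner assigned to node->task_profile.runner takes precedence over the context-level task runner. The helper names noop_runner and compute_one_node are hypothetical, and the sketch assumes the same headers these tests include and a tensor allocated by the caller.

static enum ggml_compute_error
noop_runner(const struct ggml_compute_params *params, struct ggml_tensor *node) {
    (void)params;
    (void)node; // stand-in for a real compute function
    return GGML_COMPUTE_OK;
}

static void compute_one_node(struct ggml_tensor *node) {
    // NULL thread runner: the default ggml_threading_graph_compute_thread is used.
    struct ggml_threading_context *ctx = ggml_threading_start(
        /*n_threads*/ 2, /*thread*/ NULL, /*task_runner*/ noop_runner,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time*/ NULL);

    // Per-node override: this runner is used for the node instead of the
    // context-level task runner. Stage 1 is the COMPUTE stage.
    node->task_profile.runner = noop_runner;
    node->task_profile.stages[1].backend = GGML_TASK_BACKEND_CPU;

    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    if (err != GGML_COMPUTE_OK) {
        abort();
    }

    ggml_threading_stop(ctx);
}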
@@ -382,9 +417,21 @@ int main(void) { node.src1 = &src1; node.backend = GGML_BACKEND_GPU; + stages[1].backend = GGML_TASK_BACKEND_GPU; if (test_fallback(&node) == 0) { ++n_passed; - printf("\n[test-ggml-threading] test fallback: ok\n\n"); + printf("[test-ggml-threading] test fallback: ok\n\n"); + } + } + + // customized node runner + { + printf("[test-ggml-threading] test customized node runner ...\n"); + ++n_tests; + + if (test_customized_node_runner(&node) == 0) { + ++n_passed; + printf("[test-ggml-threading] test customized node runner: ok\n\n"); } } diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 5499fa6bf..4339881e5 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -72,7 +72,9 @@ static int bench(void) { // GGML_FTYPE_ALL_F32, // GGML_FTYPE_MOSTLY_F16, GGML_FTYPE_MOSTLY_Q4_0, +#if defined(GGML_USE_K_QUANTS) GGML_FTYPE_MOSTLY_Q4_K, +#endif }; int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); @@ -132,16 +134,14 @@ int estimate_time_non_zero_NK(void) { int time[3]; // 3 profiles. }; - struct ggml_mulmat_tune tune = { - .version = 1, - .ftype = GGML_FTYPE_MOSTLY_Q4_0, - }; + struct ggml_mulmat_tune tune; + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; const int m_num = 2; const int n_threads = 1; // useless. struct ggml_mulmat_tune_params params; - init_params(¶ms, tune.ftype, m_num, n_threads); + init_params(¶ms, ftype, m_num, n_threads); ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider);
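For illustration, a short sketch of the id-to-profile mapping this patch introduces: ggml_mulmat_tune_select_task_profile now returns a profile id, and ggml_mulmat_get_task_profile looks the id up among the profiles filled in by ggml_get_task_profiles. The helper select_profile_by_id below is hypothetical, not something the patch defines.

// Hypothetical helper mirroring the id lookup added in
// ggml_mulmat_get_task_profile.
static const struct ggml_task_profile *
select_profile_by_id(const struct ggml_task_profile *profiles, int n_profiles, int id) {
    for (int i = 0; i < n_profiles; i++) {
        if (profiles[i].id == id) {
            return &profiles[i];
        }
    }
    // Unknown id: the caller decides the fallback (the patch falls back to its
    // own heuristics and ultimately to profiles[0], the plain CPU profile).
    return NULL;
}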