From 6b83a3e16fa3126d2c5e6667d2f396bde84a68b4 Mon Sep 17 00:00:00 2001 From: mqy Date: Fri, 16 Jun 2023 20:32:12 +0800 Subject: [PATCH] try make CL run w/o tunning, but -ngl stucks no output. had to add task runer and profile id, many changes, see the f codes --- examples/mulmat-tune/mulmat-tune.cpp | 27 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 37 ++- ggml-threading.h | 22 +- ggml-tune.c | 70 ++++- ggml-tune.h | 13 +- ggml.c | 445 ++++++++++++++------------- ggml.h | 29 ++ tests/test-ggml-threading.c | 61 +++- tests/test-ggml-tune.c | 10 +- 10 files changed, 435 insertions(+), 281 deletions(-) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index 55dd19275..da1d0a1c1 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -11,6 +11,10 @@ #define UNUSED(x) (void)(x) +// F16 has an pending Illegal Instruction error on macos-latest-cmake. +// So the workaround is to disable non-quantized ftypes. +// #define SUPPORT_NONE_Q_TYPE 1 + static void print_build_tips(void) { const char *a = "LLAMA_NO_ACCELERATE"; fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); @@ -62,11 +66,12 @@ static void usage(char *prog) { "--model MODEL 3B | 7B | 13B | 30B | 65B", " default 7B", "--ftype FTYPE ggml ftype:", +#ifdef SUPPORT_NONE_Q_TYPE " 0: all F32", " 1: mostly F16", +#endif " 2: mostly Q4_0", " 3: mostly Q4_1", - " 4: mostly Q4_1, some F16", " 7: mostly Q8_0", " 8: mostly Q5_0", " 9: mostly Q5_1", @@ -84,7 +89,7 @@ static void usage(char *prog) { " requires: between [1, 3]", "--n_threads NTH bench with this number of threads", " requires: between [1, 16]", - " default 1", + " default 4", "--file FILE data file to write", " default stdout", "-y always answer \"yes\" to all prompts", @@ -170,8 +175,22 @@ int main(int argc, char **argv) { ftype = (enum ggml_ftype)v; } +#ifndef SUPPORT_NONE_Q_TYPE if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) { - fprintf(stderr, "none quantized type %d is not supported\n", ftype); + fprintf(stderr, "error: none quantized type %d is not supported\n", + ftype); + return 1; + } +#endif + + bool cond_1 = ftype >= GGML_FTYPE_MOSTLY_Q4_0 && + ftype <= GGML_FTYPE_MOSTLY_Q4_1; + bool cond_2 = + ftype >= GGML_FTYPE_MOSTLY_Q8_0 && ftype <= GGML_FTYPE_MOSTLY_Q6_K; + + if (!(cond_1 || cond_2)) { + fprintf(stderr, "error: type %d is not a known ggml ftype.\n", + ftype); return 1; } } @@ -223,7 +242,7 @@ int main(int argc, char **argv) { } } - int n_threads = 1; + int n_threads = 4; { if (arg_n_threads != NULL) { int v = atoi(arg_n_threads); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index c9151a8e4..2a1a04fca 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c index 6dd6d2817..7ef763c0f 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -170,7 +170,8 @@ struct ggml_compute_state_shared { atomic_bool wait_on_done; atomic_bool stop; - ggml_threading_task_runner *task_runner; + // Default task runner, can be overriden by node.task_profile.runner. 
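+    // When a node sets task_profile.runner, both the main thread and the
+    // workers use that runner for the node instead of this default.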
+ ggml_task_runner *task_runner; struct ggml_threading_context *ctx; }; @@ -391,8 +392,10 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { } if (shared->n_tasks > 0 && state->has_work) { - enum ggml_compute_error err = - shared->task_runner(&state->params, state->node); + ggml_task_runner *runner = state->node->task_profile.runner + ? state->node->task_profile.runner + : shared->task_runner; + enum ggml_compute_error err = runner(&state->params, state->node); GGML_ASSERT(err == GGML_COMPUTE_OK); @@ -427,8 +430,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + + ggml_task_runner *runner = ctx->shared.task_runner; + if (node->task_profile.runner) { + runner = node->task_profile.runner; + } + struct ggml_compute_state_shared *state_shared = &ctx->shared; // This is the params for main thread. @@ -491,7 +499,7 @@ START: params.wsize = wsize; params.wdata = wdata; - err = state_shared->task_runner(¶ms, node); + err = runner(¶ms, node); } // wait for tasks done. @@ -509,11 +517,21 @@ START: if (err != GGML_COMPUTE_OK) { if (err == GGML_COMPUTE_FALLBACK) { + PRINT_DEBUG("[main] fallback from profile, id=%d\n", + node->task_profile.id); + GGML_ASSERT(node->task_profile.stages[1].backend > + GGML_TASK_BACKEND_CPU); + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); + GGML_ASSERT(profiles[0].stages[1].backend == + GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, &profiles[0], sizeof(struct ggml_task_profile)); + runner = ctx->shared.task_runner; + goto START; } return err; @@ -525,12 +543,13 @@ START: struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]) { GGML_ASSERT(n_threads > 0); - GGML_ASSERT(thread_runner); - GGML_ASSERT(task_stage_runner); + if (thread_runner == NULL) { + thread_runner = ggml_threading_graph_compute_thread; + } size_t ctx_sz = sizeof(struct ggml_threading_context); struct ggml_threading_context *ctx = malloc(ctx_sz); @@ -545,7 +564,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, .wait_now = false, .wait_on_done = false, .stop = false, - .task_runner = task_stage_runner, + .task_runner = task_runner, .ctx = ctx, }; diff --git a/ggml-threading.h b/ggml-threading.h index f3214efc7..189fc2ed5 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -21,27 +21,21 @@ enum ggml_threading_features { GGML_THREADING_FEATURE_PERF = 1 << 1, }; -// Compute errors. -enum ggml_compute_error { - GGML_COMPUTE_OK = 0, - GGML_COMPUTE_FALLBACK = 1, -}; - -// The task runner to be called by main thread and workers. -typedef enum ggml_compute_error(ggml_threading_task_runner)( - struct ggml_compute_params *params, struct ggml_tensor *node); - // The thread runner to feed into OS threads. typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // Init and start underlying workers if n_threads > 1. // -// features: optional for configure threading additional features. -// see `ggml_threading_feature`, default 0. +// thread: optional OS thread runner, default value: +// `ggml_threading_graph_compute_thread`. +// +// features: optional for configure +// threading additional features. see `ggml_threading_feature`, default 0. 
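+//
+// task_runner: default task runner, used for nodes that do not provide their
+// own task_profile.runner.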
+// // stages_time: optional for collecting per-stage wall clock time. struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]); @@ -60,7 +54,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, // This is an experimental functionality for mulmat tune, as a thin wrapper. enum ggml_compute_error -ggml_compute_forward_wrapper(struct ggml_compute_params *params, +ggml_compute_forward_wrapper(const struct ggml_compute_params *params, struct ggml_tensor *tensor); #ifdef __cplusplus diff --git a/ggml-tune.c b/ggml-tune.c index 20f395069..aeb63e957 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -44,9 +44,12 @@ ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { } } -const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( - struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]) { +// NOTE: we can not use the profile from tune because the profiles do not +// contain fields such as runner, get_size. +int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]) { GGML_ASSERT(tune); // TODO: default_mm_cache is thread-unsafe. @@ -103,15 +106,15 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( names[i] = ggml_mulmat_tune_task_backend_name( prof->stages[i].backend); } - printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the " - "fastest profile: %s %s %s\n", - M, N, K, names[0], names[1], names[2]); + printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " + "backends: %s %s %s\n", + M, N, K, prof->id, names[0], names[1], names[2]); #endif } } } - return prof; + return prof->id; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -264,10 +267,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { if (shape->m_num > 0) { if (shape->arr_m) { free(shape->arr_m); + shape->arr_m = NULL; } if (shape->items) { free(shape->items); + shape->items = NULL; } + shape->m_num = 0; } } } @@ -277,6 +283,11 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; + rc = fprintf(fp, "%d ", profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, @@ -304,7 +315,6 @@ static bool ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, const char *model, int ftype, int n_threads, char *errbuf, int errbuf_len) { - if (tune->version != GGML_MULMAT_TUNE_VERSION) { snprintf(errbuf, errbuf_len - 1, "version mismatch, built-in: %d, " @@ -348,14 +358,28 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)"); + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (n_profiles)"); return false; } // TODO: profiles order is relevant, too strict. 
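+    // (The loop below compares built-in and loaded profiles index by index,
+    // so reordering the built-in profiles would invalidate an otherwise
+    // compatible tune file.)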
- size_t sz = sizeof(struct ggml_task_profile) * n_profiles; - if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)"); + // Only validate stages! + size_t sz = sizeof(struct ggml_task_stage) * 3; + bool matched = true; + for (int j = 0; j < n_profiles; j++) { + if (builtin_profiles[j].id != shape->profiles[j].id) { + return false; + } + if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages, + sz) != 0) { + matched = false; + break; + } + } + if (!matched) { + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (profiles)"); printf("=== built-in profiles:\n"); ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, @@ -426,6 +450,12 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; + + rc = fscanf(fp, "%d ", &profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; int backend; @@ -777,6 +807,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, GGML_ASSERT(params); GGML_ASSERT(params->model.name); + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + enum ggml_task_backend backends[16]; int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { @@ -785,6 +817,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, return false; } + if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && + params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { +#if defined(GGML_USE_CLBLAST) + printf("[tune] error: cl implementation does not support k_quants at " + "the time of writing this code, skip.\n"); + return false; +#endif + } + bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); if (!ok) { return false; @@ -816,9 +857,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, int64_t t0 = ggml_time_ms(); struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, ggml_threading_graph_compute_thread, - ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, - stages_time); + tune->n_threads, NULL, ggml_compute_forward_wrapper, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; diff --git a/ggml-tune.h b/ggml-tune.h index b12466155..7955a50a9 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_TUNE_VERSION 9 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -55,7 +55,7 @@ struct ggml_mulmat_tune_shape { struct ggml_mulmat_tune_m *items; }; - struct ggml_mulmat_tune_cache_ele { +struct ggml_mulmat_tune_cache_ele { int M; int N; int K; @@ -98,10 +98,11 @@ struct ggml_mulmat_tune_params { }; // NOTE: stages_time is filled if not null. -const struct ggml_task_profile * -ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, - int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]); +// Return profile id. 
+int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]); bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, const char *model_name, int ftype, diff --git a/ggml.c b/ggml.c index b734f1a0c..43ec93a64 100644 --- a/ggml.c +++ b/ggml.c @@ -8500,19 +8500,14 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); } -#else - GGML_ASSERT(false); + return; + } #endif - }; const int64_t nr = ggml_nrows(src0); @@ -9938,7 +9933,7 @@ static void ggml_compute_forward_rms_norm_back( } } - +// CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10010,18 +10005,6 @@ static void ggml_compute_forward_mul_mat_f32( // compute by src0 rows enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10104,6 +10087,7 @@ static void ggml_compute_forward_mul_mat_f32( //} } +// CPU only. 
static void ggml_compute_forward_mul_mat_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10168,19 +10152,9 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10304,6 +10278,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } +// CPU only static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10373,20 +10348,8 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -14294,30 +14257,9 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); - enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { -#if defined(GGML_USE_CUBLAS) - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return GGML_COMPUTE_OK; - } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); - return GGML_COMPUTE_FALLBACK; -#else - GGML_ASSERT(false); -#endif - } - - // if (tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { - // printf("mulmat: test fallback\n"); - // return GGML_COMPUTE_FALLBACK; - // } - switch (tensor->op) { case GGML_OP_DUP: { @@ -14568,13 +14510,6 @@ static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * return GGML_COMPUTE_OK; } -enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. 
- return ggml_compute_forward(params, tensor); -} - //////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { @@ -15524,12 +15459,67 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- +// TODO: replace with ggml_compute_forward_cuda +// DO NOT check matrix size further. +#if defined(GGML_USE_CUBLAS) +static enum ggml_compute_error ggml_compute_forward_cuda( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); + if (ggml_cuda_compute_forward(params, tensor)) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +} +#endif + +// TODO: replace with ggml_cl_mul_mat. +// DO NOT check matrix size further. +#if defined(GGML_USE_CLBLAST) +static enum ggml_compute_error ggml_compute_forward_cl( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); + return GGML_COMPUTE_OK; + default: + break; + } + + GGML_ASSERT(false); +} + +static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; +} +#endif + +// The wrapper for external mulmat tune tool. +enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); +} + // Implement `ggml_task_profiles_provider`. // Fill `profiles` for the `node` and return number of profiles. // // NOTE: the node may be incompleted from testing or tunning, so please assert // everything used here. -inline int ggml_get_task_profiles( +// +// TODO: configure cuda for none mul_mat nodes. +int ggml_get_task_profiles( struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { GGML_ASSERT(node); @@ -15595,6 +15585,8 @@ inline int ggml_get_task_profiles( } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { + // CPU only profiles. + // CUDA/CL: see end of function. 
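+        // (The GPU profiles appended there carry their own runner and, for
+        // CL, a get_wsize; profile ids are assigned sequentially from 1 once
+        // all profiles have been collected.)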
GGML_ASSERT(node->src0); GGML_ASSERT(node->src1); @@ -15614,16 +15606,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (src0_t == GGML_TYPE_F16) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15635,16 +15617,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (ggml_is_quantized(src0_t)) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15658,16 +15630,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } n_profiles = i; } break; @@ -15757,7 +15719,43 @@ inline int ggml_get_task_profiles( GGML_ASSERT(false); } +#if defined(GGML_USE_CUBLAS) + switch (node->op) { + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_SILU: + case GGML_OP_RMS_NORM: + case GGML_OP_MUL_MAT: + case GGML_OP_RESHAPE: + case GGML_OP_ROPE: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#elif defined(GGML_USE_CLBLAST) + switch (node->op) { + case GGML_OP_MUL_MAT: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cl; + p[i].get_wsize = ggml_compute_forward_get_wsize_cl; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#endif + GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); + for (int i = 0; i < n_profiles; i++) { + profiles[i].id = i + 1; + } return n_profiles; } @@ -15769,7 +15767,7 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( GGML_ASSERT(node); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(n_profiles > 0); enum ggml_type src0_t = node->src0->type; enum ggml_type src1_t = node->src1->type; @@ -15777,9 +15775,9 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( // Type and memory layout requirements for computing mul_mat with BLAS. 
bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); int M = (int)node->ne[1]; int N = (int)node->ne[0]; @@ -15790,10 +15788,14 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( if (cond_match) { #if defined(GGML_USE_TUNE) if (tune != NULL) { - prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + GGML_ASSERT(n_profiles >= 2); + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); - if (prof != NULL) { - return prof; + for (int i = 0; i < n_profiles; i++) { + if (profiles[i].id == id) { + prof = &profiles[i]; + return prof; + } } } #else @@ -15841,11 +15843,101 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( return prof; } +void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, + struct ggml_cgraph *cgraph) { + // Pre-specified. + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].backend > 0) { + return; + } + } + + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(node, profiles); + + const struct ggml_task_profile *profile = NULL; + + // GPU offloading. A special case of pre-specified task_profile. + if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { + if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { + enum ggml_task_backend be; + if (ggml_cpu_has_cublas()) { + be = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + be = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + + for (int j = 0; j < n_profiles; j++) { + if (profiles[j].stages[1].backend == be) { + profile = &profiles[j]; + break; + } + } + GGML_ASSERT(profile); + GGML_ASSERT(!cgraph->tune); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + } + + // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. + if (node->op == GGML_OP_MUL_MAT) { +#if defined(GGML_USE_TUNE) + GGML_ASSERT(node->backend == GGML_BACKEND_CPU); + + int stages_time_us[3]; + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, + cgraph->tune, stages_time_us); + GGML_ASSERT(profile); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + + if (cgraph->tune) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (node->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + node->task_profile.stages[j].wait = false; + } + } + } + } + return; +#else + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; +#endif + } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + + // default. 
+ profile = &profiles[0]; + GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); +} + void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + n_threads, NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); // initialize tasks + work buffer @@ -15854,107 +15946,34 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t work_size = 0; - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) { - continue; - } - int n_profiles = ggml_get_task_profiles(node, profiles); - - const struct ggml_task_profile *profile = NULL; - - // Adapt node->backend: assume GPU at COMPUTE stage. - if (node->backend == GGML_BACKEND_GPU || - node->backend == GGML_BACKEND_GPU_SPLIT) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); - } - - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } - } - GGML_ASSERT(profile); - } else { - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - } - - bool profile_copied = false; - - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile( - node, profiles, n_profiles, cgraph->tune, stages_time_us); - GGML_ASSERT(profile); - - if (cgraph->tune) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); - profile_copied = true; - - // Do not wait if the estimated execution time is too small - // (e.g. less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j< 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; - } - } - } - } -#else - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); -#endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right? - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); - } else { - profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == - GGML_TASK_BACKEND_CPU); - } - - if (!profile_copied) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); - } + GGML_ASSERT (node->op != GGML_OP_NONE); struct ggml_task_stage *stages = node->task_profile.stages; - // Workrounnd to set node->backend. - for (int j = 0; j < 3; j++) { - if (node->backend == GGML_BACKEND_CPU && - (stages[j].backend & GGML_TASK_BACKEND_GPU)) { - if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) { - node->backend = GGML_BACKEND_GPU; - } else { - GGML_ASSERT(false); - } + ggml_graph_compute_set_tensor_task_proile(node, cgraph); + + // + // Allocate temp buffer `wdata` for CPU. + // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. 
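+        // When the selected profile provides get_wsize, a non-negative result
+        // is used directly and the per-op sizing below is skipped; a negative
+        // result means the CPU sizing logic still applies.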
+ // + + if (node->task_profile.get_wsize) { + int sz = node->task_profile.get_wsize(node); + if (sz >= 0) { + work_size = MAX(work_size, (size_t)sz); + continue; } } + //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); + // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; - // Allocate temp buffer `wdata` for CPU. - // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: @@ -16012,20 +16031,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: // FIXME: is is right? + case GGML_OP_OUT_PROD: // FIXME: is this correct? { size_t cur = 0; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node)); - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); -#else - GGML_ASSERT(false); -#endif - } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { GGML_ASSERT(ggml_cpu_has_cpublas()); GGML_ASSERT(node->src1->type == GGML_TYPE_F32); @@ -16039,11 +16050,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - GGML_ASSERT(ggml_cpu_has_cublas()); - } - + } else { // CPU or GPU fallback GGML_ASSERT(node->src1->type == GGML_TYPE_F32); if (node->src0->type == GGML_TYPE_F32) { @@ -16056,8 +16063,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else { - GGML_ASSERT(false); } work_size = MAX(work_size, cur); diff --git a/ggml.h b/ggml.h index 5ab78c4a0..d4d5d3521 100644 --- a/ggml.h +++ b/ggml.h @@ -390,11 +390,40 @@ extern "C" { bool wait; }; + struct ggml_tensor; + struct ggml_compute_params; + + // Compute errors. + enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, + }; + + // The task runner to be called by main thread and workers. + typedef enum ggml_compute_error(ggml_task_runner)( + const struct ggml_compute_params *params, + struct ggml_tensor *node); + + // Get wsize for node computing. + // When return -1: should be explained as `fallback to CPU`, caller MUST + // determine how much memory to reserve for this node. + typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); + // config for computing a tensor. struct ggml_task_profile { + // profile id, start from 1. + int id; + // index 0: INIT, 1: COMPUTE, 2: FINALIZE struct ggml_task_stage stages[3]; + // Optional task runner, overrides threading's task runner. + ggml_task_runner *runner; + + // Optional function to return required wsize for wdata. + ggml_task_get_wsize *get_wsize; + + // Optional flag for development. // MUST be used only in testing codes. 
uint8_t dev_flags[4]; }; diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 90d53e4cd..2079fe144 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -42,7 +42,8 @@ static const int n_repeat = 10; static int work_done_arr[MAX_N_THREADS]; static enum ggml_compute_error -mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { +mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -79,9 +80,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t0 = (int)ggml_time_us(); - struct ggml_threading_context *ctx = - ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, features, /*stages_time*/ NULL); + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -141,7 +141,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { } static enum ggml_compute_error -mock_task_runner_fallback(struct ggml_compute_params *params, +mock_task_runner_fallback(const struct ggml_compute_params *params, struct ggml_tensor *node) { UNUSED(params); if (node->backend == GGML_BACKEND_GPU) { @@ -158,7 +158,7 @@ mock_task_runner_fallback(struct ggml_compute_params *params, // thus it is not parallelled. int test_fallback(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( - 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + 1, NULL, mock_task_runner_fallback, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); enum ggml_compute_error err = @@ -177,6 +177,38 @@ int test_fallback(struct ggml_tensor *node) { return 0; } +static enum ggml_compute_error +customized_node_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + // Reset runner thus caller will know it was called. + node->task_profile.runner = NULL; + return GGML_COMPUTE_OK; +} + +// Test when node->task_profile.runner is not NULL. +int test_customized_node_runner(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, NULL, mock_task_runner, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + node->task_profile.runner = customized_node_runner; + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + // should not happen. + abort(); + } + + if (node->task_profile.runner != NULL) { + return 2; + } + + return 0; +} + int main(void) { ggml_time_init(); @@ -367,7 +399,10 @@ int main(void) { } } + // fallback { + printf("[test-ggml-threading] test fallback ...\n"); + ++n_tests; // required by getting task profiles. 
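For reference, a minimal usage sketch of the threading API after this patch (an editor's illustration, not part of the diff): passing NULL as the thread runner selects ggml_threading_graph_compute_thread, and a runner assigned to node->task_profile.runner takes precedence over the context-level task runner. The helper names noop_runner and compute_one_node are hypothetical, and the sketch assumes the same headers these tests include and a tensor allocated by the caller.

static enum ggml_compute_error
noop_runner(const struct ggml_compute_params *params, struct ggml_tensor *node) {
    (void)params;
    (void)node; // stand-in for a real compute function
    return GGML_COMPUTE_OK;
}

static void compute_one_node(struct ggml_tensor *node) {
    // NULL thread runner: the default ggml_threading_graph_compute_thread is used.
    struct ggml_threading_context *ctx = ggml_threading_start(
        /*n_threads*/ 2, /*thread*/ NULL, /*task_runner*/ noop_runner,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time*/ NULL);

    // Per-node override: this runner is used for the node instead of the
    // context-level task runner. Stage 1 is the COMPUTE stage.
    node->task_profile.runner = noop_runner;
    node->task_profile.stages[1].backend = GGML_TASK_BACKEND_CPU;

    enum ggml_compute_error err =
        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
    if (err != GGML_COMPUTE_OK) {
        abort();
    }

    ggml_threading_stop(ctx);
}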
@@ -382,9 +417,21 @@ int main(void) { node.src1 = &src1; node.backend = GGML_BACKEND_GPU; + stages[1].backend = GGML_TASK_BACKEND_GPU; if (test_fallback(&node) == 0) { ++n_passed; - printf("\n[test-ggml-threading] test fallback: ok\n\n"); + printf("[test-ggml-threading] test fallback: ok\n\n"); + } + } + + // customized node runner + { + printf("[test-ggml-threading] test customized node runner ...\n"); + ++n_tests; + + if (test_customized_node_runner(&node) == 0) { + ++n_passed; + printf("[test-ggml-threading] test customized node runner: ok\n\n"); } } diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 5499fa6bf..4339881e5 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -72,7 +72,9 @@ static int bench(void) { // GGML_FTYPE_ALL_F32, // GGML_FTYPE_MOSTLY_F16, GGML_FTYPE_MOSTLY_Q4_0, +#if defined(GGML_USE_K_QUANTS) GGML_FTYPE_MOSTLY_Q4_K, +#endif }; int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); @@ -132,16 +134,14 @@ int estimate_time_non_zero_NK(void) { int time[3]; // 3 profiles. }; - struct ggml_mulmat_tune tune = { - .version = 1, - .ftype = GGML_FTYPE_MOSTLY_Q4_0, - }; + struct ggml_mulmat_tune tune; + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; const int m_num = 2; const int n_threads = 1; // useless. struct ggml_mulmat_tune_params params; - init_params(¶ms, tune.ftype, m_num, n_threads); + init_params(¶ms, ftype, m_num, n_threads); ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider);
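For illustration, a short sketch of the id-to-profile mapping this patch introduces: ggml_mulmat_tune_select_task_profile now returns a profile id, and ggml_mulmat_get_task_profile looks the id up among the profiles filled in by ggml_get_task_profiles. The helper select_profile_by_id below is hypothetical, not something the patch defines.

// Hypothetical helper mirroring the id lookup added in
// ggml_mulmat_get_task_profile.
static const struct ggml_task_profile *
select_profile_by_id(const struct ggml_task_profile *profiles, int n_profiles, int id) {
    for (int i = 0; i < n_profiles; i++) {
        if (profiles[i].id == id) {
            return &profiles[i];
        }
    }
    // Unknown id: the caller decides the fallback (the patch falls back to its
    // own heuristics and ultimately to profiles[0], the plain CPU profile).
    return NULL;
}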