mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-07 00:43:14 +01:00
Try to make CL run without tuning, but with -ngl it gets stuck with no output. Had to add a task runner and profile id; many changes, see the code.
This commit is contained in:
parent
5342dc075f
commit
6b83a3e16f
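Editor's note, not part of the patch: the core of this commit is that a tensor's task profile can now carry its own id and an optional runner, and the threading code prefers that runner over the shared default. Below is a standalone sketch of that dispatch pattern using hypothetical minimal types (not the real ggml structs); see the ggml.h and ggml-threading.c hunks further down for the actual definitions.

/* Standalone sketch of the per-profile runner override added in this commit.
 * The struct and function names here are simplified stand-ins, not the real
 * ggml types. Build with: cc -std=c99 sketch.c */
#include <stdio.h>

typedef int (task_runner)(void *params, void *node);

struct task_profile {
    int id;              /* profile id, starts from 1 in this commit */
    task_runner *runner; /* optional override; NULL means "use the default" */
};

struct node {
    struct task_profile task_profile;
};

static int default_runner(void *params, void *node) {
    (void)params; (void)node;
    printf("default (CPU) runner\n");
    return 0;
}

static int cl_runner(void *params, void *node) {
    (void)params; (void)node;
    printf("per-profile (CL) runner\n");
    return 0;
}

/* Mirrors the selection logic in ggml_threading_graph_compute_thread(). */
static int compute(struct node *n, task_runner *shared_default) {
    task_runner *runner = n->task_profile.runner ? n->task_profile.runner
                                                 : shared_default;
    return runner(NULL, n);
}

int main(void) {
    struct node a = { { 1, NULL } };      /* falls back to the shared default */
    struct node b = { { 2, cl_runner } }; /* profile carries its own runner   */
    compute(&a, default_runner);
    compute(&b, default_runner);
    return 0;
}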
@@ -11,6 +11,10 @@
 #define UNUSED(x) (void)(x)
 
+// F16 has an pending Illegal Instruction error on macos-latest-cmake.
+// So the workaround is to disable non-quantized ftypes.
+// #define SUPPORT_NONE_Q_TYPE 1
+
 static void print_build_tips(void) {
     const char *a = "LLAMA_NO_ACCELERATE";
     fprintf(stderr, "Tips on how to build with various backend vendors:\n\n");
@@ -62,11 +66,12 @@ static void usage(char *prog) {
         "--model MODEL 3B | 7B | 13B | 30B | 65B",
         " default 7B",
         "--ftype FTYPE ggml ftype:",
+#ifdef SUPPORT_NONE_Q_TYPE
         " 0: all F32",
         " 1: mostly F16",
+#endif
         " 2: mostly Q4_0",
         " 3: mostly Q4_1",
-        " 4: mostly Q4_1, some F16",
         " 7: mostly Q8_0",
         " 8: mostly Q5_0",
         " 9: mostly Q5_1",
@@ -84,7 +89,7 @@ static void usage(char *prog) {
         " requires: between [1, 3]",
         "--n_threads NTH bench with this number of threads",
         " requires: between [1, 16]",
-        " default 1",
+        " default 4",
         "--file FILE data file to write",
         " default stdout",
         "-y always answer \"yes\" to all prompts",
@@ -170,8 +175,22 @@ int main(int argc, char **argv) {
             ftype = (enum ggml_ftype)v;
         }
 
+#ifndef SUPPORT_NONE_Q_TYPE
         if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) {
-            fprintf(stderr, "none quantized type %d is not supported\n", ftype);
+            fprintf(stderr, "error: none quantized type %d is not supported\n",
+                    ftype);
+            return 1;
+        }
+#endif
+
+        bool cond_1 = ftype >= GGML_FTYPE_MOSTLY_Q4_0 &&
+                      ftype <= GGML_FTYPE_MOSTLY_Q4_1;
+        bool cond_2 =
+            ftype >= GGML_FTYPE_MOSTLY_Q8_0 && ftype <= GGML_FTYPE_MOSTLY_Q6_K;
+
+        if (!(cond_1 || cond_2)) {
+            fprintf(stderr, "error: type %d is not a known ggml ftype.\n",
+                    ftype);
             return 1;
         }
     }
@@ -223,7 +242,7 @@ int main(int argc, char **argv) {
         }
     }
 
-    int n_threads = 1;
+    int n_threads = 4;
     {
         if (arg_n_threads != NULL) {
             int v = atoi(arg_n_threads);
@@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
 }
 
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
-    // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
+    GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
 
     if (src0->type == GGML_TYPE_F32) {
         ggml_cl_mul_mat_f32(src0, src1, dst);
@@ -170,7 +170,8 @@ struct ggml_compute_state_shared {
     atomic_bool wait_on_done;
     atomic_bool stop;
 
-    ggml_threading_task_runner *task_runner;
+    // Default task runner, can be overriden by node.task_profile.runner.
+    ggml_task_runner *task_runner;
 
     struct ggml_threading_context *ctx;
 };
@@ -391,8 +392,10 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
         }
 
         if (shared->n_tasks > 0 && state->has_work) {
-            enum ggml_compute_error err =
-                shared->task_runner(&state->params, state->node);
+            ggml_task_runner *runner = state->node->task_profile.runner
+                                           ? state->node->task_profile.runner
+                                           : shared->task_runner;
+            enum ggml_compute_error err = runner(&state->params, state->node);
 
             GGML_ASSERT(err == GGML_COMPUTE_OK);
 
@@ -427,8 +430,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                               size_t wsize) {
     GGML_ASSERT(ctx);
     GGML_ASSERT(node);
 
     GGML_ASSERT(ctx->shared.task_runner);
 
+    ggml_task_runner *runner = ctx->shared.task_runner;
+    if (node->task_profile.runner) {
+        runner = node->task_profile.runner;
+    }
+
     struct ggml_compute_state_shared *state_shared = &ctx->shared;
 
     // This is the params for main thread.
@@ -491,7 +499,7 @@ START:
         params.wsize = wsize;
         params.wdata = wdata;
 
-        err = state_shared->task_runner(&params, node);
+        err = runner(&params, node);
     }
 
     // wait for tasks done.
@@ -509,11 +517,21 @@ START:
 
     if (err != GGML_COMPUTE_OK) {
         if (err == GGML_COMPUTE_FALLBACK) {
+            PRINT_DEBUG("[main] fallback from profile, id=%d\n",
+                        node->task_profile.id);
+            GGML_ASSERT(node->task_profile.stages[1].backend >
+                        GGML_TASK_BACKEND_CPU);
+
             struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
             int n = ggml_get_task_profiles(node, profiles);
             GGML_ASSERT(n > 0);
+            GGML_ASSERT(profiles[0].stages[1].backend ==
+                        GGML_TASK_BACKEND_CPU);
+
             memcpy(&node->task_profile, &profiles[0],
                    sizeof(struct ggml_task_profile));
+            runner = ctx->shared.task_runner;
+
             goto START;
         }
         return err;
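The fallback path above is what makes the GPU runners added later in this commit optional at run time: a per-profile runner may return GGML_COMPUTE_FALLBACK, and the main thread then copies the CPU profile (profiles[0]) into the node, resets the runner to the shared default, and retries. A runner that opts out this way could look like the sketch below; gpu_can_handle and gpu_do_compute are hypothetical placeholders, while the real CUDA variant introduced further down is ggml_compute_forward_cuda.

// Hedged sketch, not part of the patch: a runner that defers to the CPU
// profile when it cannot handle a node. Helper names are hypothetical.
static enum ggml_compute_error
sketch_gpu_runner(const struct ggml_compute_params *params,
                  struct ggml_tensor *node) {
    if (gpu_can_handle(node)) {          // hypothetical capability check
        gpu_do_compute(params, node);    // hypothetical GPU kernel call
        return GGML_COMPUTE_OK;
    }
    // Tell ggml_threading_compute_tensor() to fall back to profiles[0]
    // (the CPU profile) and retry this node.
    return GGML_COMPUTE_FALLBACK;
}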
@@ -525,12 +543,13 @@ START:
 
 struct ggml_threading_context *
 ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
-                     ggml_threading_task_runner *task_stage_runner,
+                     ggml_task_runner *task_runner,
                      enum ggml_threading_features features,
                      int64_t stages_time[3]) {
     GGML_ASSERT(n_threads > 0);
-    GGML_ASSERT(thread_runner);
-    GGML_ASSERT(task_stage_runner);
+    if (thread_runner == NULL) {
+        thread_runner = ggml_threading_graph_compute_thread;
+    }
 
     size_t ctx_sz = sizeof(struct ggml_threading_context);
     struct ggml_threading_context *ctx = malloc(ctx_sz);
@@ -545,7 +564,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
         .wait_now = false,
         .wait_on_done = false,
         .stop = false,
-        .task_runner = task_stage_runner,
+        .task_runner = task_runner,
         .ctx = ctx,
     };
 
@@ -21,27 +21,21 @@ enum ggml_threading_features {
     GGML_THREADING_FEATURE_PERF = 1 << 1,
 };
 
-// Compute errors.
-enum ggml_compute_error {
-    GGML_COMPUTE_OK = 0,
-    GGML_COMPUTE_FALLBACK = 1,
-};
-
-// The task runner to be called by main thread and workers.
-typedef enum ggml_compute_error(ggml_threading_task_runner)(
-    struct ggml_compute_params *params, struct ggml_tensor *node);
-
 // The thread runner to feed into OS threads.
 typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
 
 // Init and start underlying workers if n_threads > 1.
 //
-// features: optional for configure threading additional features.
-//           see `ggml_threading_feature`, default 0.
+// thread: optional OS thread runner, default value:
+//         `ggml_threading_graph_compute_thread`.
+//
+// features: optional for configure
+//           threading additional features. see `ggml_threading_feature`, default 0.
+//
 // stages_time: optional for collecting per-stage wall clock time.
 struct ggml_threading_context *
 ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread,
-                     ggml_threading_task_runner *task_stage_runner,
+                     ggml_task_runner *task_runner,
                      enum ggml_threading_features features,
                      int64_t stages_time[3]);
 
@@ -60,7 +54,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
 // This is an experimental functionality for mulmat tune, as a thin wrapper.
 enum ggml_compute_error
-ggml_compute_forward_wrapper(struct ggml_compute_params *params,
+ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
                              struct ggml_tensor *tensor);
 
 #ifdef __cplusplus
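With the signature change above, the call sites touched by this commit (ggml-tune.c and the threading tests) pass NULL for the thread runner and let it default to ggml_threading_graph_compute_thread. The fragment below is condensed from those call sites as a usage sketch; setup of n_threads, node, wdata, wsize and stages_time, and error handling, are omitted.

// Usage sketch condensed from the updated call sites in this commit.
struct ggml_threading_context *ctx = ggml_threading_start(
    n_threads, /*thread*/ NULL, ggml_compute_forward_wrapper,
    GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);

enum ggml_compute_error err =
    ggml_threading_compute_tensor(ctx, node, wdata, wsize);

ggml_threading_stop(ctx);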
ggml-tune.c (70 changed lines)
@@ -44,9 +44,12 @@ ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
     }
 }
 
-const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
-    struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t,
-    enum ggml_type src1_t, int stages_time[3]) {
+// NOTE: we can not use the profile from tune because the profiles do not
+// contain fields such as runner, get_size.
+int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
+                                         int N, int K, enum ggml_type src0_t,
+                                         enum ggml_type src1_t,
+                                         int stages_time[3]) {
     GGML_ASSERT(tune);
 
     // TODO: default_mm_cache is thread-unsafe.
@@ -103,15 +106,15 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
                 names[i] = ggml_mulmat_tune_task_backend_name(
                     prof->stages[i].backend);
             }
-            printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
-                   "fastest profile: %s %s %s\n",
-                   M, N, K, names[0], names[1], names[2]);
+            printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
+                   "backends: %s %s %s\n",
+                   M, N, K, prof->id, names[0], names[1], names[2]);
 #endif
             }
         }
     }
 
-    return prof;
+    return prof->id;
 }
 
 void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
@@ -264,10 +267,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
         if (shape->m_num > 0) {
             if (shape->arr_m) {
                 free(shape->arr_m);
+                shape->arr_m = NULL;
             }
             if (shape->items) {
                 free(shape->items);
+                shape->items = NULL;
             }
+            shape->m_num = 0;
         }
     }
 }
@@ -277,6 +283,11 @@ static bool ggml_mulmat_tune_write_profiles(
     int rc;
     for (int i = 0; i < n_profiles; i++) {
         const struct ggml_task_profile *profile = &profiles[i];
+        rc = fprintf(fp, "%d ", profile->id);
+        if (rc <= 0) {
+            return false;
+        }
+
         for (int j = 0; j < 3; j++) {
             const struct ggml_task_stage *ts = &profile->stages[j];
             rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
@@ -304,7 +315,6 @@ static bool
 ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
                                    const char *model, int ftype, int n_threads,
                                    char *errbuf, int errbuf_len) {
-
     if (tune->version != GGML_MULMAT_TUNE_VERSION) {
         snprintf(errbuf, errbuf_len - 1,
                  "version mismatch, built-in: %d, "
@@ -348,14 +358,28 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
     int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
 
     if (n_profiles != shape->n_profiles) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)");
+        snprintf(errbuf, errbuf_len - 1,
+                 "task profiles mismatch (n_profiles)");
         return false;
     }
 
     // TODO: profiles order is relevant, too strict.
-    size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
-    if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)");
+    // Only validate stages!
+    size_t sz = sizeof(struct ggml_task_stage) * 3;
+    bool matched = true;
+    for (int j = 0; j < n_profiles; j++) {
+        if (builtin_profiles[j].id != shape->profiles[j].id) {
+            return false;
+        }
+        if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages,
+                   sz) != 0) {
+            matched = false;
+            break;
+        }
+    }
+    if (!matched) {
+        snprintf(errbuf, errbuf_len - 1,
+                 "task profiles mismatch (profiles)");
 
         printf("=== built-in profiles:\n");
         ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
@@ -426,6 +450,12 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
 
         for (int ip = 0; ip < shape->n_profiles; ip++) {
             struct ggml_task_profile *profile = &shape->profiles[ip];
+
+            rc = fscanf(fp, "%d ", &profile->id);
+            if (rc <= 0) {
+                return false;
+            }
+
             for (int j = 0; j < 3; j++) {
                 struct ggml_task_stage *ts = &profile->stages[j];
                 int backend;
@@ -777,6 +807,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     GGML_ASSERT(params);
     GGML_ASSERT(params->model.name);
 
+    memset(tune, 0, sizeof(struct ggml_mulmat_tune));
+
     enum ggml_task_backend backends[16];
     int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
     if (n_backends < 2) {
@@ -785,6 +817,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
         return false;
     }
 
+    if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K &&
+        params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) {
+#if defined(GGML_USE_CLBLAST)
+        printf("[tune] error: cl implementation does not support k_quants at "
+               "the time of writing this code, skip.\n");
+        return false;
+#endif
+    }
+
     bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
     if (!ok) {
         return false;
@@ -816,9 +857,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
     int64_t t0 = ggml_time_ms();
 
     struct ggml_threading_context *thrd_ctx = ggml_threading_start(
-        tune->n_threads, ggml_threading_graph_compute_thread,
-        ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE,
-        stages_time);
+        tune->n_threads, NULL, ggml_compute_forward_wrapper,
+        GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
 
     for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
         const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
@@ -10,7 +10,7 @@
 extern "C" {
 #endif
 
-#define GGML_MULMAT_TUNE_VERSION 8
+#define GGML_MULMAT_TUNE_VERSION 9
 #define GGML_MULMAT_N_SHAPES 4
 #define GGML_MULMAT_CACHE_LEN 16
 
@@ -98,10 +98,11 @@ struct ggml_mulmat_tune_params {
 };
 
 // NOTE: stages_time is filled if not null.
-const struct ggml_task_profile *
-ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
+// Return profile id.
+int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
                                          int N, int K, enum ggml_type src0_t,
-                                     enum ggml_type src1_t, int stages_time[3]);
+                                         enum ggml_type src1_t,
+                                         int stages_time[3]);
 
 bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
                                const char *model_name, int ftype,
ggml.c (355 changed lines)
@@ -8500,8 +8500,6 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend;
-    if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
 #ifdef GGML_USE_CLBLAST
     if (src1->backend == GGML_BACKEND_GPU) {
         if (ith == 0) {
@@ -8509,10 +8507,7 @@ static void ggml_compute_forward_mul_f32(
         }
         return;
     }
-#else
-    GGML_ASSERT(false);
 #endif
-    };
 
     const int64_t nr = ggml_nrows(src0);
 
@@ -9938,7 +9933,7 @@ static void ggml_compute_forward_rms_norm_back(
     }
 }
 
+// CPU only
 static void ggml_compute_forward_mul_mat_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10010,18 +10005,6 @@ static void ggml_compute_forward_mul_mat_f32(
     // compute by src0 rows
 
     enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend;
-
-    if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
-#if defined(GGML_USE_CLBLAST)
-        GGML_ASSERT(params->nth == 1);
-        GGML_ASSERT(params->type == GGML_TASK_COMPUTE);
-        ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        return;
-#else
-        GGML_ASSERT(false);
-#endif
-    }
 
     GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU);
 
     if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
@@ -10104,6 +10087,7 @@ static void ggml_compute_forward_mul_mat_f32(
     //}
 }
 
+// CPU only.
 static void ggml_compute_forward_mul_mat_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10168,19 +10152,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
+    enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend;
     enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend;
 
-    if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
-#if defined(GGML_USE_CLBLAST)
-        GGML_ASSERT(params->type == GGML_TASK_COMPUTE);
-        ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        return;
-#else
-        GGML_ASSERT(false);
-#endif
-    }
-
-    enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend;
     GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU);
 
     if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
@@ -10304,6 +10278,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
+// CPU only
 static void ggml_compute_forward_mul_mat_q_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10373,20 +10348,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend;
-
-    if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
-#if defined(GGML_USE_CLBLAST)
-        GGML_ASSERT(params->nth == 1);
-        GGML_ASSERT(params->type == GGML_TASK_COMPUTE);
-        ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        return;
-#else
-        GGML_ASSERT(false);
-#endif
-    }
-
     enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend;
+    enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend;
     GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU);
 
     if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
@@ -14294,30 +14257,9 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 
 /////////////////////////////////
 
-static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
-    enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend;
-
-    if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
-#if defined(GGML_USE_CUBLAS)
-        bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-        if (skip_cpu) {
-            return GGML_COMPUTE_OK;
-        }
-        GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
-        GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
-        return GGML_COMPUTE_FALLBACK;
-#else
-        GGML_ASSERT(false);
-#endif
-    }
-
-    // if (tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) {
-    //     printf("mulmat: test fallback\n");
-    //     return GGML_COMPUTE_FALLBACK;
-    // }
-
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -14568,13 +14510,6 @@ static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params *
     return GGML_COMPUTE_OK;
 }
 
-enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params,
-                                                     struct ggml_tensor *tensor) {
-    // We call ggml_compute_forward because the CUDA mul_mat entry point
-    // was moved out of `ggml_compute_forward_mul_mat`.
-    return ggml_compute_forward(params, tensor);
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
@@ -15524,12 +15459,67 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
 
 // ---- task profiles ----
 
+// TODO: replace with ggml_compute_forward_cuda
+// DO NOT check matrix size further.
+#if defined(GGML_USE_CUBLAS)
+static enum ggml_compute_error ggml_compute_forward_cuda(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * tensor) {
+    GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor));
+    if (ggml_cuda_compute_forward(params, tensor)) {
+        return GGML_COMPUTE_OK;
+    }
+    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
+    return GGML_COMPUTE_FALLBACK;
+}
+#endif
+
+// TODO: replace with ggml_cl_mul_mat.
+// DO NOT check matrix size further.
+#if defined(GGML_USE_CLBLAST)
+static enum ggml_compute_error ggml_compute_forward_cl(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * tensor) {
+    switch (tensor->op) {
+        case GGML_OP_MUL_MAT:
+            GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor));
+            ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize);
+            return GGML_COMPUTE_OK;
+        default:
+            break;
+    }
+
+    GGML_ASSERT(false);
+}
+
+static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) {
+    switch (tensor->op) {
+        case GGML_OP_MUL_MAT:
+            return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor);
+        default:
+            break;
+    }
+    return -1;
+}
+#endif
+
+// The wrapper for external mulmat tune tool.
+enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
+                                                     struct ggml_tensor *tensor) {
+    // We call ggml_compute_forward because the CUDA mul_mat entry point
+    // was moved out of `ggml_compute_forward_mul_mat`.
+    return ggml_compute_forward(params, tensor);
+}
+
 // Implement `ggml_task_profiles_provider`.
 // Fill `profiles` for the `node` and return number of profiles.
 //
 // NOTE: the node may be incompleted from testing or tunning, so please assert
 // everything used here.
-inline int ggml_get_task_profiles(
+//
+// TODO: configure cuda for none mul_mat nodes.
+int ggml_get_task_profiles(
     struct ggml_tensor *node,
     struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) {
     GGML_ASSERT(node);
@@ -15595,6 +15585,8 @@ inline int ggml_get_task_profiles(
         } break;
         case GGML_OP_MUL_MAT:
         case GGML_OP_OUT_PROD: {
+            // CPU only profiles.
+            // CUDA/CL: see end of function.
             GGML_ASSERT(node->src0);
             GGML_ASSERT(node->src1);
 
@@ -15614,16 +15606,6 @@ inline int ggml_get_task_profiles(
                 p[i].stages[1].wait = true;
                 i++;
 #endif
-
-#if defined(GGML_USE_CUBLAS)
-                p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA;
-                p[i].stages[1].wait = true;
-                i++;
-#elif defined(GGML_USE_CLBLAST)
-                p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL;
-                p[i].stages[1].wait = true;
-                i++;
-#endif
             } else if (src0_t == GGML_TYPE_F16) {
                 p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
                 p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
@@ -15635,16 +15617,6 @@ inline int ggml_get_task_profiles(
                 p[i].stages[1].wait = true;
                 i++;
 #endif
-
-#if defined(GGML_USE_CUBLAS)
-                p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA;
-                p[i].stages[1].wait = true;
-                i++;
-#elif defined(GGML_USE_CLBLAST)
-                p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL;
-                p[i].stages[1].wait = true;
-                i++;
-#endif
             } else if (ggml_is_quantized(src0_t)) {
                 p[i].stages[0].backend = GGML_TASK_BACKEND_CPU;
                 p[i].stages[1].backend = GGML_TASK_BACKEND_CPU;
@@ -15658,16 +15630,6 @@ inline int ggml_get_task_profiles(
                 p[i].stages[1].wait = true;
                 i++;
 #endif
-
-#if defined(GGML_USE_CUBLAS)
-                p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA;
-                p[i].stages[1].wait = true;
-                i++;
-#elif defined(GGML_USE_CLBLAST)
-                p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL;
-                p[i].stages[1].wait = true;
-                i++;
-#endif
             }
             n_profiles = i;
         } break;
@@ -15757,7 +15719,43 @@ inline int ggml_get_task_profiles(
         GGML_ASSERT(false);
     }
 
+#if defined(GGML_USE_CUBLAS)
+    switch (node->op) {
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_SILU:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_ROPE: {
+            int i = n_profiles;
+            p[i].runner = ggml_compute_forward_cuda;
+            p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA;
+            p[i].stages[1].wait = true;
+            ++n_profiles;
+        } break;
+        default: {
+        } break;
+    }
+#elif defined(GGML_USE_CLBLAST)
+    switch (node->op) {
+        case GGML_OP_MUL_MAT: {
+            int i = n_profiles;
+            p[i].runner = ggml_compute_forward_cl;
+            p[i].get_wsize = ggml_compute_forward_get_wsize_cl;
+            p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL;
+            p[i].stages[1].wait = true;
+            ++n_profiles;
+        } break;
+        default: {
+        } break;
+    }
+#endif
+
     GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES);
+    for (int i = 0; i < n_profiles; i++) {
+        profiles[i].id = i + 1;
+    }
     return n_profiles;
 }
 
@@ -15769,7 +15767,7 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile(
     GGML_ASSERT(node);
     GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD);
     GGML_ASSERT(profiles);
-    GGML_ASSERT(n_profiles >= 2);
+    GGML_ASSERT(n_profiles > 0);
 
     enum ggml_type src0_t = node->src0->type;
     enum ggml_type src1_t = node->src1->type;
@@ -15790,12 +15788,16 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile(
     if (cond_match) {
 #if defined(GGML_USE_TUNE)
         if (tune != NULL) {
-            prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t,
+            GGML_ASSERT(n_profiles >= 2);
+            int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t,
                                                         src1_t, stages_time_us);
-            if (prof != NULL) {
+            for (int i = 0; i < n_profiles; i++) {
+                if (profiles[i].id == id) {
+                    prof = &profiles[i];
                     return prof;
                 }
             }
+        }
 #else
         UNUSED(tune);
         UNUSED(stages_time_us);
@@ -15841,35 +15843,23 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile(
     return prof;
 }
 
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    int n_threads = cgraph->n_threads;
-
-    struct ggml_threading_context *thrd_ctx = ggml_threading_start(
-        n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward,
-        GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL);
-
-    // initialize tasks + work buffer
-    {
-        // int64_t t0 = ggml_time_us();
-
-        size_t work_size = 0;
-
-        struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
-
-        // thread scheduling for the different operations
-        for (int i = 0; i < cgraph->n_nodes; i++) {
-            struct ggml_tensor * node = cgraph->nodes[i];
-            if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) {
-                continue;
+void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node,
+                                               struct ggml_cgraph *cgraph) {
+    // Pre-specified.
+    for (int i = 0; i < 3; i++) {
+        if (node->task_profile.stages[i].backend > 0) {
+            return;
+        }
     }
 
+    struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
     int n_profiles = ggml_get_task_profiles(node, profiles);
 
     const struct ggml_task_profile *profile = NULL;
 
-    // Adapt node->backend: assume GPU at COMPUTE stage.
-    if (node->backend == GGML_BACKEND_GPU ||
-        node->backend == GGML_BACKEND_GPU_SPLIT) {
+    // GPU offloading. A special case of pre-specified task_profile.
+    if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) {
+        if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) {
             enum ggml_task_backend be;
             if (ggml_cpu_has_cublas()) {
                 be = GGML_TASK_BACKEND_GPU_CUDA;
@@ -15886,23 +15876,28 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             }
         }
         GGML_ASSERT(profile);
-    } else {
-        GGML_ASSERT(node->backend == GGML_BACKEND_CPU);
+        GGML_ASSERT(!cgraph->tune);
+        memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
+        return;
+    }
     }
 
-    bool profile_copied = false;
+    // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD.
 
     if (node->op == GGML_OP_MUL_MAT) {
 #if defined(GGML_USE_TUNE)
+        GGML_ASSERT(node->backend == GGML_BACKEND_CPU);
+
         int stages_time_us[3];
-        profile = ggml_mulmat_get_task_profile(
-            node, profiles, n_profiles, cgraph->tune, stages_time_us);
+        profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles,
+                                               cgraph->tune, stages_time_us);
         GGML_ASSERT(profile);
 
+        memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
+
         if (cgraph->tune) {
             memcpy(&node->task_profile, profile,
                    sizeof(struct ggml_task_profile));
-            profile_copied = true;
 
             // Do not wait if the estimated execution time is too small
             // (e.g. less than 0.1 ms)
@@ -15916,45 +15911,69 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             }
         }
     }
+            return;
 #else
-        profile = ggml_mulmat_get_task_profile(node, profiles,
-                                               n_profiles, NULL, NULL);
+        profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL,
+                                               NULL);
         GGML_ASSERT(profile);
+        memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
+        return;
 #endif
-    } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right?
-        profile = ggml_mulmat_get_task_profile(node, profiles,
-                                               n_profiles, NULL, NULL);
+    } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct?
+        profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL,
+                                               NULL);
         GGML_ASSERT(profile);
-    } else {
-        profile = &profiles[0];
-        GGML_ASSERT(profile->stages[1].backend ==
-                    GGML_TASK_BACKEND_CPU);
+        memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
+        return;
     }
 
-    if (!profile_copied) {
-        memcpy(&node->task_profile, profile,
-               sizeof(struct ggml_task_profile));
+    // default.
+    profile = &profiles[0];
+    GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU);
+    memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
 }
 
+void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    int n_threads = cgraph->n_threads;
+
+    struct ggml_threading_context *thrd_ctx = ggml_threading_start(
+        n_threads, NULL, ggml_compute_forward,
+        GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL);
+
+    // initialize tasks + work buffer
+    {
+        // int64_t t0 = ggml_time_us();
+
+        size_t work_size = 0;
+
+        // thread scheduling for the different operations
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+
+            GGML_ASSERT (node->op != GGML_OP_NONE);
 
             struct ggml_task_stage *stages = node->task_profile.stages;
 
-            // Workrounnd to set node->backend.
-            for (int j = 0; j < 3; j++) {
-                if (node->backend == GGML_BACKEND_CPU &&
-                    (stages[j].backend & GGML_TASK_BACKEND_GPU)) {
-                    if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) {
-                        node->backend = GGML_BACKEND_GPU;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
+            ggml_graph_compute_set_tensor_task_proile(node, cgraph);
+
+            //
+            // Allocate temp buffer `wdata` for CPU.
+            // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases.
+            //
+            if (node->task_profile.get_wsize) {
+                int sz = node->task_profile.get_wsize(node);
+                if (sz >= 0) {
+                    work_size = MAX(work_size, (size_t)sz);
+                    continue;
                 }
             }
 
+            //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend);
+
             // compute stage n_tasks.
             int n_tasks = stages[1].parallel ? n_threads : 1;
 
-            // Allocate temp buffer `wdata` for CPU.
-            // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases.
             switch (node->op) {
                 case GGML_OP_CPY:
                 case GGML_OP_DUP:
@@ -16012,20 +16031,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     {
                     } break;
                 case GGML_OP_MUL_MAT:
-                case GGML_OP_OUT_PROD: // FIXME: is is right?
+                case GGML_OP_OUT_PROD: // FIXME: is this correct?
                     {
                         size_t cur = 0;
                         enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend;
                         GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE);
 
-                        if (comp_backend == GGML_TASK_BACKEND_GPU_CL) {
-#if defined(GGML_USE_CLBLAST)
-                            GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node));
-                            cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
-#else
-                            GGML_ASSERT(false);
-#endif
-                        } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
+                        if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) {
                             GGML_ASSERT(ggml_cpu_has_cpublas());
                             GGML_ASSERT(node->src1->type == GGML_TYPE_F32);
 
@@ -16039,11 +16050,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             } else {
                                 GGML_ASSERT(false);
                             }
-                        } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
-                            if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) {
-                                GGML_ASSERT(ggml_cpu_has_cublas());
-                            }
-
+                        } else { // CPU or GPU fallback
                             GGML_ASSERT(node->src1->type == GGML_TYPE_F32);
 
                             if (node->src0->type == GGML_TYPE_F32) {
@@ -16056,8 +16063,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             } else {
                                 GGML_ASSERT(false);
                             }
-                        } else {
-                            GGML_ASSERT(false);
-                        }
                         }
 
                         work_size = MAX(work_size, cur);
ggml.h (29 changed lines)
@@ -390,11 +390,40 @@ extern "C" {
         bool wait;
     };
 
+    struct ggml_tensor;
+    struct ggml_compute_params;
+
+    // Compute errors.
+    enum ggml_compute_error {
+        GGML_COMPUTE_OK = 0,
+        GGML_COMPUTE_FALLBACK = 1,
+    };
+
+    // The task runner to be called by main thread and workers.
+    typedef enum ggml_compute_error(ggml_task_runner)(
+        const struct ggml_compute_params *params,
+        struct ggml_tensor *node);
+
+    // Get wsize for node computing.
+    // When return -1: should be explained as `fallback to CPU`, caller MUST
+    // determine how much memory to reserve for this node.
+    typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor);
+
     // config for computing a tensor.
     struct ggml_task_profile {
+        // profile id, start from 1.
+        int id;
+
         // index 0: INIT, 1: COMPUTE, 2: FINALIZE
         struct ggml_task_stage stages[3];
 
+        // Optional task runner, overrides threading's task runner.
+        ggml_task_runner *runner;
+
+        // Optional function to return required wsize for wdata.
+        ggml_task_get_wsize *get_wsize;
+
+        // Optional flag for development.
         // MUST be used only in testing codes.
         uint8_t dev_flags[4];
     };
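To make the new get_wsize hook concrete: it lets a profile report how much shared work-buffer memory its node needs, and returning -1 means "no answer, size it the usual CPU way". The fragment below is a condensed restatement of how ggml_graph_compute() consumes the hook in this commit's ggml.c changes; it is a sketch, not additional API, and default CPU sizing is elided.

// Condensed from the ggml.c changes in this commit (not new API).
if (node->task_profile.get_wsize) {
    int sz = node->task_profile.get_wsize(node);
    if (sz >= 0) {
        work_size = MAX(work_size, (size_t)sz); // profile knows its wsize
        continue;                               // skip the per-op CPU sizing
    }
    // sz < 0: fall through to the regular CPU work-buffer sizing below.
}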
@@ -42,7 +42,8 @@ static const int n_repeat = 10;
 static int work_done_arr[MAX_N_THREADS];
 
 static enum ggml_compute_error
-mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) {
+mock_task_runner(const struct ggml_compute_params *params,
+                 struct ggml_tensor *node) {
     int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000;
     if (node->task_profile.stages[params->type].parallel) {
         loops /= params->nth;
@@ -79,9 +80,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
 
     int t0 = (int)ggml_time_us();
 
-    struct ggml_threading_context *ctx =
-        ggml_threading_start(n_threads, ggml_threading_graph_compute_thread,
-                             mock_task_runner, features, /*stages_time*/ NULL);
+    struct ggml_threading_context *ctx = ggml_threading_start(
+        n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL);
 
     int t1 = (int)ggml_time_us();
 
@@ -141,7 +141,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) {
 }
 
 static enum ggml_compute_error
-mock_task_runner_fallback(struct ggml_compute_params *params,
+mock_task_runner_fallback(const struct ggml_compute_params *params,
                           struct ggml_tensor *node) {
     UNUSED(params);
     if (node->backend == GGML_BACKEND_GPU) {
@@ -158,7 +158,7 @@ mock_task_runner_fallback(struct ggml_compute_params *params,
 // thus it is not parallelled.
 int test_fallback(struct ggml_tensor *node) {
     struct ggml_threading_context *ctx = ggml_threading_start(
-        1, ggml_threading_graph_compute_thread, mock_task_runner_fallback,
+        1, NULL, mock_task_runner_fallback,
         /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL);
 
     enum ggml_compute_error err =
@@ -177,6 +177,38 @@ int test_fallback(struct ggml_tensor *node) {
     return 0;
 }
 
+static enum ggml_compute_error
+customized_node_runner(const struct ggml_compute_params *params,
+                       struct ggml_tensor *node) {
+    UNUSED(params);
+    // Reset runner thus caller will know it was called.
+    node->task_profile.runner = NULL;
+    return GGML_COMPUTE_OK;
+}
+
+// Test when node->task_profile.runner is not NULL.
+int test_customized_node_runner(struct ggml_tensor *node) {
+    struct ggml_threading_context *ctx = ggml_threading_start(
+        1, NULL, mock_task_runner,
+        /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL);
+
+    node->task_profile.runner = customized_node_runner;
+    enum ggml_compute_error err =
+        ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0);
+
+    ggml_threading_stop(ctx);
+    if (err != GGML_COMPUTE_OK) {
+        // should not happen.
+        abort();
+    }
+
+    if (node->task_profile.runner != NULL) {
+        return 2;
+    }
+
+    return 0;
+}
+
 int main(void) {
     ggml_time_init();
 
@@ -367,7 +399,10 @@ int main(void) {
         }
     }
 
+    // fallback
     {
+        printf("[test-ggml-threading] test fallback ...\n");
+
         ++n_tests;
 
         // required by getting task profiles.
@@ -382,9 +417,21 @@ int main(void) {
         node.src1 = &src1;
 
         node.backend = GGML_BACKEND_GPU;
+        stages[1].backend = GGML_TASK_BACKEND_GPU;
         if (test_fallback(&node) == 0) {
             ++n_passed;
-            printf("\n[test-ggml-threading] test fallback: ok\n\n");
+            printf("[test-ggml-threading] test fallback: ok\n\n");
+        }
+    }
+
+    // customized node runner
+    {
+        printf("[test-ggml-threading] test customized node runner ...\n");
+        ++n_tests;
+
+        if (test_customized_node_runner(&node) == 0) {
+            ++n_passed;
+            printf("[test-ggml-threading] test customized node runner: ok\n\n");
         }
     }
@@ -72,7 +72,9 @@ static int bench(void) {
         // GGML_FTYPE_ALL_F32,
         // GGML_FTYPE_MOSTLY_F16,
         GGML_FTYPE_MOSTLY_Q4_0,
+#if defined(GGML_USE_K_QUANTS)
         GGML_FTYPE_MOSTLY_Q4_K,
+#endif
     };
 
     int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
@@ -132,16 +134,14 @@ int estimate_time_non_zero_NK(void) {
         int time[3]; // 3 profiles.
     };
 
-    struct ggml_mulmat_tune tune = {
-        .version = 1,
-        .ftype = GGML_FTYPE_MOSTLY_Q4_0,
-    };
+    struct ggml_mulmat_tune tune;
 
+    enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0;
     const int m_num = 2;
     const int n_threads = 1; // useless.
 
     struct ggml_mulmat_tune_params params;
-    init_params(&params, tune.ftype, m_num, n_threads);
+    init_params(&params, ftype, m_num, n_threads);
 
     ggml_mulmat_tune_init(&tune, &params, ggml_task_profiles_mock_qxx_provider);
 