mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-14 14:28:58 +01:00
80dd7ff22f
* tests: Fix memory bandwidth calculation for perf tests Add a flops calculation for flash attention. Add one GGML_OP_CPY perf test. * vulkan: Optimize contiguous copies Add a variant of the copy shader for when the tensors are contiguous. Avoid the complex addressing calculations, and do four elements per invocation to hide some other overhead. Apply similar changes to the scale shader, since scale is always contiguous. Add a "progress bar" for shader compiles.
3953 lines
140 KiB
C++
3953 lines
140 KiB
C++
// This file defines tests for various GGML ops and backends.
|
|
// For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent.
|
|
// For the backward pass it asserts that the gradients from backpropagation are consistent
|
|
// with the gradients obtained via the method of finite differences ("grad" mode, this is optional).
|
|
// It is also possible to check the performance ("perf" mode).
|
|
//
|
|
// this file has three sections: Section 1 does general setup, section 2 defines the GGML ops to be tested,
|
|
// and section 3 defines which tests to run.
|
|
// Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case,
|
|
// then go to section 3 and add an instantiation of your struct.
|
|
|
|
|
|
// ##############################
|
|
// ## Section 1: General Setup ##
|
|
// ##############################
|
|
|
|
|
|
#include <ggml.h>
|
|
#include <ggml-cpu.h>
|
|
#include <ggml-alloc.h>
|
|
#include <ggml-backend.h>
|
|
|
|
#include <algorithm>
|
|
#include <array>
|
|
#include <cfloat>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
#include <cinttypes>
|
|
#include <functional>
|
|
#include <memory>
|
|
#include <random>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <future>
|
|
#include <vector>
|
|
|
|
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
|
size_t nels = ggml_nelements(tensor);
|
|
std::vector<float> data(nels);
|
|
{
|
|
// parallel initialization
|
|
static const size_t n_threads = std::thread::hardware_concurrency();
|
|
// static RNG initialization (revisit if n_threads stops being constant)
|
|
static std::vector<std::default_random_engine> generators = []() {
|
|
std::random_device rd;
|
|
std::vector<std::default_random_engine> vec;
|
|
vec.reserve(n_threads);
|
|
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
|
|
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
|
|
return vec;
|
|
}();
|
|
|
|
auto init_thread = [&](size_t ith, size_t start, size_t end) {
|
|
std::uniform_real_distribution<float> distribution(min, max);
|
|
auto & gen = generators[ith];
|
|
for (size_t i = start; i < end; i++) {
|
|
data[i] = distribution(gen);
|
|
}
|
|
};
|
|
|
|
std::vector<std::future<void>> tasks;
|
|
tasks.reserve(n_threads);
|
|
for (size_t i = 0; i < n_threads; i++) {
|
|
size_t start = i*nels/n_threads;
|
|
size_t end = (i+1)*nels/n_threads;
|
|
tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
|
|
}
|
|
for (auto & t : tasks) {
|
|
t.get();
|
|
}
|
|
}
|
|
|
|
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
|
ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
|
|
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
|
GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0);
|
|
|
|
// dummy importance matrix
|
|
std::vector<float> imatrix(tensor->ne[0], 1.0f);
|
|
const float * im = imatrix.data();
|
|
if (!ggml_quantize_requires_imatrix(tensor->type)) {
|
|
// when the imatrix is optional, we want to test both quantization with and without imatrix
|
|
// use one of the random numbers to decide
|
|
if (data[0] > 0.5f*(min + max)) {
|
|
im = nullptr;
|
|
}
|
|
}
|
|
|
|
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, nels));
|
|
{
|
|
// parallel quantization by block
|
|
size_t blck_size = ggml_blck_size(tensor->type);
|
|
size_t n_blocks = nels / blck_size;
|
|
|
|
auto quantize_thread = [&](size_t start, size_t end) {
|
|
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
|
|
start * blck_size, end - start, blck_size, im);
|
|
};
|
|
|
|
const size_t min_blocks_per_thread = 1;
|
|
const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
|
|
std::max<size_t>(1, n_blocks / min_blocks_per_thread));
|
|
std::vector<std::future<void>> tasks;
|
|
tasks.reserve(n_threads);
|
|
for (size_t i = 0; i < n_threads; i++) {
|
|
size_t start = i*n_blocks/n_threads;
|
|
size_t end = (i+1)*n_blocks/n_threads;
|
|
tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
|
|
}
|
|
for (auto & t : tasks) {
|
|
t.get();
|
|
}
|
|
}
|
|
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
|
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
|
// This is going to create some weird integers though.
|
|
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
|
|
} else if (tensor->type == GGML_TYPE_I64) {
|
|
// Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
|
|
const size_t nbytes_half = ggml_nbytes(tensor)/2;
|
|
ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
|
|
ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
|
|
} else {
|
|
GGML_ABORT("fatal error");
|
|
}
|
|
}
|
|
|
|
static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|
std::vector<float> tv;
|
|
tv.reserve(ggml_nelements(t));
|
|
|
|
std::vector<uint8_t> buf(ggml_nbytes(t));
|
|
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
|
|
|
const auto * tt = ggml_get_type_traits(t->type);
|
|
size_t bs = ggml_blck_size(t->type);
|
|
std::vector<float> vq(ggml_blck_size(t->type));
|
|
bool quantized = ggml_is_quantized(t->type);
|
|
|
|
// access elements by index to avoid gaps in views
|
|
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
|
|
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
|
|
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
|
|
for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
|
|
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
|
|
if (t->type == GGML_TYPE_F16) {
|
|
tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
|
|
} else if (t->type == GGML_TYPE_BF16) {
|
|
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
|
|
} else if (t->type == GGML_TYPE_F32) {
|
|
tv.push_back(*(float *) &buf[i]);
|
|
} else if (t->type == GGML_TYPE_I64) {
|
|
tv.push_back((float)*(int64_t *) &buf[i]);
|
|
} else if (t->type == GGML_TYPE_I32) {
|
|
tv.push_back((float)*(int32_t *) &buf[i]);
|
|
} else if (t->type == GGML_TYPE_I16) {
|
|
tv.push_back((float)*(int16_t *) &buf[i]);
|
|
} else if (t->type == GGML_TYPE_I8) {
|
|
tv.push_back((float)*(int8_t *) &buf[i]);
|
|
} else if (quantized) {
|
|
tt->to_float(&buf[i], vq.data(), bs);
|
|
tv.insert(tv.end(), vq.begin(), vq.end());
|
|
} else {
|
|
GGML_ABORT("fatal error");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return tv;
|
|
}
|
|
|
|
// normalized mean squared error = mse(a, b) / mse(a, 0)
|
|
static double nmse(const float * a, const float * b, size_t n) {
|
|
double mse_a_b = 0.0;
|
|
double mse_a_0 = 0.0;
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
float a_i = a[i];
|
|
float b_i = b[i];
|
|
|
|
mse_a_b += (a_i - b_i) * (a_i - b_i);
|
|
mse_a_0 += a_i * a_i;
|
|
}
|
|
|
|
return mse_a_b / mse_a_0;
|
|
}
|
|
|
|
// maximum absolute asymmetry between a and b
|
|
// asymmetry: (a - b) / (a + b)
|
|
// This is more stable than relative error if one of the values fluctuates towards zero.
|
|
// n: number of values to compare.
|
|
// expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where
|
|
// a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail.
|
|
static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector<float> & expected_vals) {
|
|
double sum = 0.0f;
|
|
|
|
size_t nvalid = 0;
|
|
for (size_t i = 0; i < n; i++) {
|
|
if (!expected_vals.empty()) {
|
|
bool matches_any = false;
|
|
for (const float & ev : expected_vals) {
|
|
if (fabsf(a[i] - ev) < 1e-3f) {
|
|
matches_any = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!matches_any) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
const float asymm = (a[i] - b[i]) / (a[i] + b[i]);
|
|
|
|
sum += fabsf(asymm);
|
|
nvalid++;
|
|
}
|
|
|
|
return sum/nvalid;
|
|
}
|
|
|
|
// utils for printing the variables of the test cases
|
|
|
|
template<typename T>
|
|
static std::string var_to_str(const T & x) {
|
|
return std::to_string(x);
|
|
}
|
|
|
|
template<typename T, size_t N>
|
|
static std::string var_to_str(const T (&x)[N]) {
|
|
std::string s = "[";
|
|
for (size_t i = 0; i < N; i++) {
|
|
if (i > 0) {
|
|
s += ",";
|
|
}
|
|
s += var_to_str(x[i]);
|
|
}
|
|
s += "]";
|
|
return s;
|
|
}
|
|
|
|
template<typename T, size_t N>
|
|
static std::string var_to_str(const std::array<T, N> & x) {
|
|
std::string s = "[";
|
|
for (size_t i = 0; i < N; i++) {
|
|
if (i > 0) {
|
|
s += ",";
|
|
}
|
|
s += var_to_str(x[i]);
|
|
}
|
|
s += "]";
|
|
return s;
|
|
}
|
|
|
|
static std::string var_to_str(ggml_type type) {
|
|
return ggml_type_name(type);
|
|
}
|
|
|
|
static std::string var_to_str(ggml_op_pool pool) {
|
|
switch (pool) {
|
|
case GGML_OP_POOL_AVG: return "avg";
|
|
case GGML_OP_POOL_MAX: return "max";
|
|
default: return std::to_string(pool);
|
|
}
|
|
}
|
|
|
|
#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
|
|
|
|
#define VARS_TO_STR1(a) VAR_TO_STR(a)
|
|
#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
|
|
#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
|
|
#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
|
|
#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
|
|
#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
|
|
#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
|
|
#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
|
|
#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
|
|
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
|
|
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
|
|
#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l)
|
|
|
|
#ifdef GGML_USE_SYCL
|
|
static bool inline _isinf(float f) {
|
|
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
|
|
}
|
|
#else
|
|
static bool inline _isinf(float f) { return std::isinf(f); }
|
|
#endif
|
|
|
|
// accept FLT_MAX as infinity
|
|
static bool isinf_or_max(float f) {
|
|
return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
|
|
}
|
|
|
|
static bool ggml_is_view_op(enum ggml_op op) {
|
|
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
|
}
|
|
|
|
enum test_mode {
|
|
MODE_TEST,
|
|
MODE_PERF,
|
|
MODE_GRAD,
|
|
};
|
|
|
|
struct test_case {
|
|
virtual ~test_case() {}
|
|
|
|
virtual std::string op_desc(ggml_tensor * t) {
|
|
return ggml_op_desc(t);
|
|
}
|
|
|
|
virtual std::string vars() {
|
|
return "";
|
|
}
|
|
|
|
virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
|
|
|
|
virtual double max_nmse_err() {
|
|
return 1e-7;
|
|
}
|
|
|
|
virtual double max_maa_err() {
|
|
return 1e-4;
|
|
}
|
|
|
|
virtual float grad_eps() {
|
|
return 1e-1f;
|
|
}
|
|
|
|
// If false, estimate gradient with 2 points, neglects 3rd order derivative and higher.
|
|
// If true, estimate gradient with 4 points, neglects 5th order derivative and higher.
|
|
virtual bool grad_precise() {
|
|
return false;
|
|
}
|
|
|
|
// Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests).
|
|
virtual int64_t grad_nmax() {
|
|
return 10000;
|
|
}
|
|
|
|
// No effect if empty.
|
|
// If not empty, skip all gradient checks where the numerical result does not match any of the values.
|
|
// Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable.
|
|
virtual std::vector<float> grad_expect() {
|
|
return {};
|
|
}
|
|
|
|
virtual void initialize_tensors(ggml_context * ctx) {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
|
init_tensor_uniform(t);
|
|
}
|
|
}
|
|
|
|
virtual size_t op_size(ggml_tensor * t) {
|
|
size_t size = ggml_nbytes(t);
|
|
// add source tensors
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
if (t->src[i] != NULL) {
|
|
size += ggml_nbytes(t->src[i]);
|
|
}
|
|
}
|
|
return size;
|
|
}
|
|
|
|
virtual uint64_t op_flops(ggml_tensor * t) {
|
|
GGML_UNUSED(t);
|
|
return 0;
|
|
}
|
|
|
|
ggml_cgraph * gf = nullptr;
|
|
ggml_cgraph * gb = nullptr;
|
|
|
|
static const int sentinel_size = 1024;
|
|
|
|
test_mode mode;
|
|
|
|
std::vector<ggml_tensor *> sentinels;
|
|
|
|
void add_sentinel(ggml_context * ctx) {
|
|
if (mode == MODE_PERF || mode == MODE_GRAD) {
|
|
return;
|
|
}
|
|
ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
|
|
ggml_format_name(sentinel, "sent_%zu", sentinels.size());
|
|
sentinels.push_back(sentinel);
|
|
}
|
|
|
|
// hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
|
|
|
|
ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
|
|
ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
|
|
add_sentinel(ctx);
|
|
return t;
|
|
}
|
|
|
|
ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
|
|
ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
|
|
add_sentinel(ctx);
|
|
return t;
|
|
}
|
|
|
|
ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
|
|
ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
|
|
add_sentinel(ctx);
|
|
return t;
|
|
}
|
|
|
|
ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
|
|
ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
|
|
add_sentinel(ctx);
|
|
return t;
|
|
}
|
|
|
|
ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
|
ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
|
|
add_sentinel(ctx);
|
|
return t;
|
|
}
|
|
|
|
bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
|
|
mode = MODE_TEST;
|
|
|
|
ggml_init_params params = {
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
|
/* .mem_base = */ NULL,
|
|
/* .no_alloc = */ true,
|
|
};
|
|
ggml_context * ctx = ggml_init(params);
|
|
GGML_ASSERT(ctx);
|
|
|
|
gf = ggml_new_graph(ctx);
|
|
|
|
// pre-graph sentinel
|
|
add_sentinel(ctx);
|
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
|
fflush(stdout);
|
|
|
|
// check if the backends support the ops
|
|
bool supported = true;
|
|
for (ggml_backend_t backend : {backend1, backend2}) {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (!ggml_backend_supports_op(backend, t)) {
|
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
|
supported = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (!supported) {
|
|
printf("\n");
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
// post-graph sentinel
|
|
add_sentinel(ctx);
|
|
|
|
// allocate
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
|
|
if (buf == NULL) {
|
|
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
|
|
ggml_free(ctx);
|
|
return false;
|
|
}
|
|
|
|
// build graph
|
|
ggml_build_forward_expand(gf, out);
|
|
|
|
// add sentinels as graph nodes so that they are checked in the callback
|
|
for (ggml_tensor * sentinel : sentinels) {
|
|
ggml_graph_add_node(gf, sentinel);
|
|
}
|
|
|
|
// randomize tensors
|
|
initialize_tensors(ctx);
|
|
|
|
// compare
|
|
struct callback_userdata {
|
|
bool ok;
|
|
double max_err;
|
|
ggml_backend_t backend1;
|
|
ggml_backend_t backend2;
|
|
};
|
|
|
|
callback_userdata ud {
|
|
true,
|
|
max_nmse_err(),
|
|
backend1,
|
|
backend2
|
|
};
|
|
|
|
auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
|
|
callback_userdata * ud = (callback_userdata *) user_data;
|
|
const char * bn1 = ggml_backend_name(ud->backend1);
|
|
const char * bn2 = ggml_backend_name(ud->backend2);
|
|
|
|
if (t1->op == GGML_OP_NONE) {
|
|
// sentinels must be unchanged
|
|
std::vector<uint8_t> t1_data(ggml_nbytes(t1));
|
|
std::vector<uint8_t> t2_data(ggml_nbytes(t2));
|
|
ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
|
|
ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
|
|
|
|
if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
|
|
printf("sentinel mismatch: %s ", t1->name);
|
|
ud->ok = false;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
std::vector<float> f1 = tensor_to_float(t1);
|
|
std::vector<float> f2 = tensor_to_float(t2);
|
|
|
|
for (size_t i = 0; i < f1.size(); i++) {
|
|
// check for nans
|
|
if (std::isnan(f1[i]) || std::isnan(f2[i])) {
|
|
printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
|
|
ud->ok = false;
|
|
return true;
|
|
}
|
|
// check for infs: both must be inf of the same sign, or both must be finite
|
|
if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
|
|
if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
|
|
if (std::signbit(f1[i]) != std::signbit(f2[i])) {
|
|
printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
|
ud->ok = false;
|
|
return true;
|
|
}
|
|
} else {
|
|
printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
|
ud->ok = false;
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
double err = nmse(f1.data(), f2.data(), f1.size());
|
|
if (err > ud->max_err) {
|
|
printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
|
|
//for (int i = 0; i < (int) f1.size(); i++) {
|
|
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
|
//}
|
|
//printf("\n");
|
|
//exit(1);
|
|
ud->ok = false;
|
|
}
|
|
return true;
|
|
|
|
GGML_UNUSED(index);
|
|
};
|
|
|
|
const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
|
|
|
|
if (!cmp_ok) {
|
|
printf("compare failed ");
|
|
}
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
ggml_free(ctx);
|
|
|
|
if (ud.ok && cmp_ok) {
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
return true;
|
|
}
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
return false;
|
|
}
|
|
|
|
bool eval_perf(ggml_backend_t backend, const char * op_name) {
|
|
mode = MODE_PERF;
|
|
|
|
static const size_t graph_nodes = 8192;
|
|
|
|
ggml_init_params params = {
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
|
|
/* .mem_base = */ NULL,
|
|
/* .no_alloc = */ true,
|
|
};
|
|
ggml_context * ctx = ggml_init(params);
|
|
GGML_ASSERT(ctx);
|
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
|
fflush(stdout);
|
|
|
|
// check if backends support op
|
|
if (!ggml_backend_supports_op(backend, out)) {
|
|
printf("not supported\n");
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
// align while also leaving some margin for variations in parameters
|
|
int align = 8;
|
|
int last = (len + align - 1) / align * align;
|
|
if (last - len < 5) {
|
|
last += align;
|
|
}
|
|
printf("%*s", last - len, "");
|
|
|
|
// allocate
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
|
if (buf == NULL) {
|
|
printf("failed to allocate tensors\n");
|
|
ggml_free(ctx);
|
|
return false;
|
|
}
|
|
|
|
// randomize tensors
|
|
initialize_tensors(ctx);
|
|
|
|
// build graph
|
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
|
|
ggml_build_forward_expand(gf, out);
|
|
|
|
// warmup run
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
// determine number of runs
|
|
int n_runs;
|
|
if (op_flops(out) > 0) {
|
|
// based on flops
|
|
const uint64_t GFLOP = 1000 * 1000 * 1000;
|
|
const uint64_t target_flops_cpu = 8ULL * GFLOP;
|
|
const uint64_t target_flops_gpu = 100ULL * GFLOP;
|
|
uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
|
|
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
|
|
} else {
|
|
// based on memory size
|
|
const size_t GB = 1ULL << 30;
|
|
const size_t target_size_cpu = 8 * GB;
|
|
const size_t target_size_gpu = 32 * GB;
|
|
size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
|
|
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
|
|
}
|
|
|
|
// duplicate the op
|
|
for (int i = 1; i < n_runs; i++) {
|
|
ggml_graph_add_node(gf, out);
|
|
}
|
|
|
|
// calculate memory
|
|
size_t mem = n_runs * op_size(out);
|
|
auto tensor_op_size = [](ggml_tensor * t) {
|
|
size_t size = ggml_nbytes(t);
|
|
// add source tensors
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
if (t->src[i] != NULL) {
|
|
size += ggml_nbytes(t->src[i]);
|
|
}
|
|
}
|
|
return size;
|
|
};
|
|
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
|
if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
|
|
continue;
|
|
}
|
|
mem += tensor_op_size(ggml_graph_node(gf, i));
|
|
}
|
|
|
|
// run
|
|
int64_t total_time_us = 0;
|
|
int64_t total_mem = 0;
|
|
int total_runs = 0;
|
|
do {
|
|
int64_t start_time = ggml_time_us();
|
|
ggml_backend_graph_compute(backend, gf);
|
|
int64_t end_time = ggml_time_us();
|
|
|
|
total_time_us += end_time - start_time;
|
|
total_mem += mem;
|
|
total_runs += n_runs;
|
|
} while (total_time_us < 1000*1000); // run for at least 1 second
|
|
|
|
printf(" %8d runs - %8.2f us/run - ",
|
|
total_runs,
|
|
(double)total_time_us / total_runs);
|
|
|
|
if (op_flops(out) > 0) {
|
|
double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6);
|
|
auto format_flops = [](double flops) -> std::string {
|
|
char buf[256];
|
|
if (flops >= 1e12) {
|
|
snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12);
|
|
} else if (flops >= 1e9) {
|
|
snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9);
|
|
} else if (flops >= 1e6) {
|
|
snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6);
|
|
} else {
|
|
snprintf(buf, sizeof(buf), "%6.2f KFLOP", flops / 1e3);
|
|
}
|
|
return buf;
|
|
};
|
|
printf("%s/run - \033[1;34m%sS\033[0m",
|
|
format_flops(op_flops(out)).c_str(),
|
|
format_flops(flops_per_sec).c_str());
|
|
|
|
} else {
|
|
printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
|
|
op_size(out) / 1024,
|
|
total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
|
|
}
|
|
printf("\n");
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool eval_grad(ggml_backend_t backend, const char * op_name) {
|
|
mode = MODE_GRAD;
|
|
const std::vector<float> expect = grad_expect();
|
|
|
|
ggml_init_params params = {
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true),
|
|
/* .mem_base = */ NULL,
|
|
/* .no_alloc = */ true,
|
|
};
|
|
ggml_context * ctx = ggml_init(params);
|
|
GGML_ASSERT(ctx);
|
|
|
|
gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
|
|
gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
|
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) {
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
|
fflush(stdout);
|
|
|
|
if (out->type != GGML_TYPE_F32) {
|
|
ggml_free(ctx);
|
|
printf("not supported [%s->type != FP32]\n", out->name);
|
|
return true;
|
|
}
|
|
|
|
// check if the backend supports the ops
|
|
bool supported = true;
|
|
bool any_params = false;
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (!ggml_backend_supports_op(backend, t)) {
|
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
|
supported = false;
|
|
break;
|
|
}
|
|
if ((t->flags & GGML_TENSOR_FLAG_PARAM)) {
|
|
any_params = true;
|
|
if (t->type != GGML_TYPE_F32) {
|
|
printf("not supported [%s->type != FP32] ", t->name);
|
|
supported = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (!any_params) {
|
|
printf("not supported [%s] \n", op_name);
|
|
supported = false;
|
|
}
|
|
if (!supported) {
|
|
printf("\n");
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
int64_t ngrads = 0;
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (t->flags & GGML_TENSOR_FLAG_PARAM) {
|
|
ngrads += ggml_nelements(t);
|
|
}
|
|
}
|
|
if (ngrads > grad_nmax()) {
|
|
printf("skipping large tensors for speed \n");
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
|
|
if (!ggml_is_scalar(out)) {
|
|
out = ggml_sum(ctx, out);
|
|
ggml_set_name(out, "sum_of_out");
|
|
}
|
|
ggml_set_loss(out);
|
|
|
|
ggml_build_forward_expand(gf, out);
|
|
ggml_graph_cpy(gf, gb);
|
|
ggml_build_backward_expand(ctx, gf, gb, false);
|
|
if (expect.size() != 1 || expect[0] != 0.0f) {
|
|
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
|
|
}
|
|
}
|
|
|
|
// TODO: refactor so that this check is only needed once
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (!ggml_backend_supports_op(backend, t)) {
|
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
|
supported = false;
|
|
break;
|
|
}
|
|
if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
|
|
printf("not supported [%s->type != FP32] ", t->name);
|
|
supported = false;
|
|
break;
|
|
}
|
|
}
|
|
if (!supported) {
|
|
printf("\n");
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
// allocate
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
|
if (buf == NULL) {
|
|
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend));
|
|
ggml_free(ctx);
|
|
return false;
|
|
}
|
|
|
|
|
|
initialize_tensors(ctx); // Randomizes all tensors (including gradients).
|
|
ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise.
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
ggml_backend_graph_compute(backend, gb);
|
|
|
|
bool ok = true;
|
|
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) {
|
|
continue;
|
|
}
|
|
|
|
const char * bn = ggml_backend_name(backend);
|
|
const int64_t ne = ggml_nelements(t);
|
|
|
|
std::vector<float> ga = tensor_to_float(t->grad);
|
|
|
|
for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
|
|
// check for nans
|
|
if (!std::isfinite(ga[i])) {
|
|
printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]);
|
|
ok = false;
|
|
break;
|
|
}
|
|
}
|
|
if (!ok) {
|
|
break;
|
|
}
|
|
|
|
std::vector<float> gn(ne); // gradient numeric
|
|
GGML_ASSERT(ga.size() == gn.size());
|
|
|
|
std::vector<float> x0 = tensor_to_float(t); // original t data
|
|
GGML_ASSERT(ggml_is_scalar(out));
|
|
GGML_ASSERT(out->type == GGML_TYPE_F32);
|
|
|
|
const float eps = grad_eps();
|
|
for (int64_t i = 0; i < ne; ++i) {
|
|
const float xiu = x0[i] + 1.0f*eps; // x, index i, up
|
|
const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half
|
|
const float xidh = x0[i] - 0.5f*eps; // x, index i, down half
|
|
const float xid = x0[i] - 1.0f*eps; // x, index i, down
|
|
|
|
float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh
|
|
|
|
ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float));
|
|
ggml_backend_graph_compute(backend, gf);
|
|
ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out));
|
|
|
|
ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float));
|
|
ggml_backend_graph_compute(backend, gf);
|
|
ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out));
|
|
|
|
if (grad_precise()) {
|
|
ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float));
|
|
ggml_backend_graph_compute(backend, gf);
|
|
ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out));
|
|
|
|
ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float));
|
|
ggml_backend_graph_compute(backend, gf);
|
|
ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out));
|
|
|
|
gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps);
|
|
} else {
|
|
gn[i] = (fu - fd) / (2.0f*eps);
|
|
}
|
|
|
|
ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t));
|
|
}
|
|
|
|
const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect);
|
|
if (err > max_maa_err()) {
|
|
printf("[%s] MAA = %.9f > %.9f ", ggml_op_desc(t), err, max_maa_err());
|
|
ok = false;
|
|
break;
|
|
}
|
|
if (!ok) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!ok) {
|
|
printf("compare failed ");
|
|
}
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
ggml_free(ctx);
|
|
|
|
if (ok) {
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
return true;
|
|
}
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
return false;
|
|
}
|
|
};
|
|
|
|
|
|
// ###################################
|
|
// ## Section 2: GGML Op Defintions ##
|
|
// ###################################
|
|
|
|
|
|
// The following is an example showing the bare minimum for creating a test for a GGML op.
|
|
|
|
// GGML_OP_EXAMPLE
|
|
struct test_example : public test_case {
|
|
// Always define these 2 or variants thereof:
|
|
const ggml_type type; // The type of the input tensors.
|
|
const std::array<int64_t, 4> ne; // The shape of the input tensors.
|
|
// For some ops it's necessary to define multiple types or shapes for the inputs.
|
|
// Or they may need additional parameters.
|
|
|
|
// Put all parameters needed to fully define the test into one of the VARS_TO_STR macros.
|
|
// In most cases these are just the properties of the struct that you defined above.
|
|
// This is needed for info prints.
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
// Define a constructor for the struct.
|
|
// In most cases it will be sufficient to have the same arguments as the struct has properties
|
|
// and just use initializer lists.
|
|
test_example(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
: type(type), ne(ne) {}
|
|
|
|
// Define how a simple GGML compute graph can be constructed for the new GGML op.
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
// Step 1: create input tensors that don't depend on any other tensors:
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging.
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(b, "b");
|
|
|
|
// Step 2: use the op that you want to test in the GGML compute graph.
|
|
ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition.
|
|
ggml_set_name(out, "out");
|
|
|
|
// Step 3: return the output tensor.
|
|
return out;
|
|
}
|
|
// In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
|
|
// immediately after you create the tensors.
|
|
// This is optional and only makes sense if a backward pass has actually been implemented for the new op.
|
|
};
|
|
|
|
|
|
// GGML_OP_UNARY
|
|
struct test_unary : public test_case {
|
|
const ggml_unary_op op;
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
int v; // view (1 : non-contiguous a)
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne_a, v);
|
|
}
|
|
|
|
test_unary(ggml_unary_op op,
|
|
ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {128, 2, 2, 2},
|
|
int v = 0)
|
|
: op(op), type(type), ne_a(ne_a), v(v) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG ||
|
|
op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU;
|
|
|
|
ggml_tensor * a;
|
|
if (v & 1) {
|
|
auto ne = ne_a; ne[0] *= 3;
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
if (grad_supported) {
|
|
ggml_set_param(ctx, a);
|
|
}
|
|
ggml_set_name(a, "a");
|
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
ggml_set_name(a, "view_of_a");
|
|
} else {
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
if (grad_supported) {
|
|
ggml_set_param(ctx, a);
|
|
}
|
|
ggml_set_name(a, "a");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_unary(ctx, a, op);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
// test extended range of values to check for NaNs in GELU
|
|
init_tensor_uniform(t, -150.f, 150.f);
|
|
}
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 15.0f;
|
|
}
|
|
|
|
std::vector<float> grad_expect() override {
|
|
if (op == GGML_UNARY_OP_ABS) {
|
|
return {-1.0f, 1.0f};
|
|
}
|
|
if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) {
|
|
return {0.0f};
|
|
}
|
|
if (op == GGML_UNARY_OP_RELU) {
|
|
return {0.0f, 1.0f};
|
|
}
|
|
return {};
|
|
}
|
|
|
|
};
|
|
|
|
// GGML_OP_GET_ROWS
|
|
struct test_get_rows : public test_case {
|
|
const ggml_type type;
|
|
const int n; // cols
|
|
const int m; // rows
|
|
const int r; // rows to get
|
|
const int b; // batch size
|
|
const bool v; // view (non-contiguous src1)
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR6(type, n, m, r, b, v);
|
|
}
|
|
|
|
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
|
|
: type(type), n(n), m(m), r(r), b(b), v(v) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
|
|
ggml_set_name(in, "in");
|
|
|
|
ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
|
|
ggml_set_name(rows, "rows");
|
|
if (v) {
|
|
rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
|
|
ggml_set_name(rows, "view_of_rows");
|
|
}
|
|
|
|
const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
|
|
if (grad_supported) {
|
|
ggml_set_param(ctx, in);
|
|
// rows is a constant input -> no gradients
|
|
}
|
|
|
|
ggml_tensor * out = ggml_get_rows(ctx, in, rows);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (t->type == GGML_TYPE_I32) {
|
|
if (ggml_is_view_op(t->op)) { continue; }
|
|
// rows
|
|
std::vector<int> data(r*b);
|
|
for (int i = 0; i < r*b; i++) {
|
|
data[i] = rand() % m;
|
|
}
|
|
ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
|
|
} else {
|
|
init_tensor_uniform(t);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// GGML_OP_ARGMAX
|
|
struct test_argmax : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_argmax(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 100, 1, 1})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_argmax(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 0.0;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_COUNT_EQUAL
|
|
struct test_count_equal : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_count_equal(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {4, 500, 1, 1})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * a_argmax = ggml_argmax(ctx, a);
|
|
ggml_set_name(a_argmax, "a_argmax");
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(b, "b");
|
|
|
|
ggml_tensor * b_argmax = ggml_argmax(ctx, a);
|
|
ggml_set_name(b_argmax, "b_argmax");
|
|
|
|
ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 0.0;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_REPEAT
|
|
struct test_repeat : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const std::array<int, 4> nr;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, nr);
|
|
}
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
return ggml_nbytes(t) * 2;
|
|
}
|
|
|
|
test_repeat(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
|
std::array<int, 4> nr = {2, 2, 2, 2})
|
|
: type(type), ne(ne), nr(nr) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
|
|
ggml_set_name(target, "target");
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, src);
|
|
ggml_set_name(src, "src");
|
|
|
|
ggml_tensor * out = ggml_repeat(ctx, src, target);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_DUP
|
|
struct test_dup : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const std::array<int64_t, 4> permute;
|
|
bool _use_permute;
|
|
|
|
std::string vars() override {
|
|
std::string v = VARS_TO_STR2(type, ne);
|
|
if (_use_permute) v += "," + VAR_TO_STR(permute);
|
|
return v;
|
|
}
|
|
|
|
test_dup(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 10, 20, 1},
|
|
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
|
: type(type), ne(ne), permute(permute),
|
|
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, src);
|
|
ggml_set_name(src, "src");
|
|
|
|
if (_use_permute) {
|
|
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
|
ggml_set_name(src, "src_permuted");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_dup(ctx, src);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SET
|
|
struct test_set : public test_case {
|
|
const ggml_type type_src;
|
|
const ggml_type type_dst;
|
|
const std::array<int64_t, 4> ne;
|
|
const int dim;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR4(type_src, type_dst, ne, dim);
|
|
}
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
|
|
}
|
|
|
|
test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {6, 5, 4, 3}, int dim = 1)
|
|
: type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
|
ggml_set_param(ctx, src);
|
|
ggml_set_name(src, "src");
|
|
|
|
auto ne_dst = ne;
|
|
for (int i = 0; i < dim; ++i) {
|
|
ne_dst[i] *= 2;
|
|
}
|
|
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
|
|
ggml_set_param(ctx, dst);
|
|
ggml_set_name(dst, "dst");
|
|
|
|
size_t offset = 0;
|
|
for (int i = 0; i < dim; ++i) {
|
|
offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
|
|
}
|
|
ggml_tensor * out = ggml_set(ctx, dst, src,
|
|
// The backward pass requires setting a contiguous region:
|
|
src->nb[1], src->nb[2], src->nb[3], offset);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_CPY
|
|
struct test_cpy : public test_case {
|
|
const ggml_type type_src;
|
|
const ggml_type type_dst;
|
|
const std::array<int64_t, 4> ne;
|
|
const std::array<int64_t, 4> permute;
|
|
bool _src_use_permute;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR4(type_src, type_dst, ne, permute);
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 1e-6;
|
|
}
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
|
|
}
|
|
|
|
test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
|
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
|
: type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
|
|
_src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
|
ggml_set_param(ctx, src);
|
|
ggml_set_name(src, "src");
|
|
|
|
if (_src_use_permute) {
|
|
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
|
ggml_set_name(src, "src_permuted");
|
|
}
|
|
|
|
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
|
|
ggml_set_name(dst, "dst");
|
|
|
|
ggml_tensor * out = ggml_cpy(ctx, src, dst);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_CONT
|
|
struct test_cont : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_cont(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 1})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, src);
|
|
ggml_set_name(src, "src");
|
|
|
|
src = ggml_transpose(ctx, src);
|
|
ggml_set_name(src, "src_transposed");
|
|
|
|
ggml_tensor * out = ggml_cont(ctx, src);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_ADD
|
|
// GGML_OP_MUL
|
|
// GGML_OP_DIV
|
|
struct test_bin_bcast : public test_case {
|
|
using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
|
|
op_t op;
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const std::array<int, 4> nr;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, nr);
|
|
}
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
return ggml_nbytes(t) * 3;
|
|
}
|
|
|
|
test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 10, 1, 1},
|
|
std::array<int, 4> nr = {1, 2, 1, 1})
|
|
: op(op), type(type), ne(ne), nr(nr) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(b, "b");
|
|
|
|
// The backward pass supports broadcasting only for GGML_ADD:
|
|
const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
|
|
if (grad_supported) {
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_param(ctx, b);
|
|
}
|
|
|
|
ggml_tensor * out = op(ctx, a, b);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (op == ggml_mul || op == ggml_div) {
|
|
// MUL and DIV have numerical issues around zero:
|
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
|
} else {
|
|
init_tensor_uniform(t);
|
|
}
|
|
}
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1);
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return op == ggml_div;
|
|
}
|
|
|
|
double max_maa_err() override {
|
|
return op == ggml_add ? 1e-4 : 1e-3;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_ADD1
|
|
struct test_add1 : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_add1(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
|
|
// ggml_set_param(ctx, b); // TODO: implement
|
|
ggml_set_name(b, "b");
|
|
|
|
ggml_tensor * out = ggml_add1(ctx, a, b);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SCALE
|
|
struct test_scale : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
float scale;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, scale);
|
|
}
|
|
|
|
test_scale(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
|
float scale = 2.0f)
|
|
: type(type), ne(ne), scale(scale) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_scale(ctx, a, scale);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_NORM
|
|
struct test_norm : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
float eps;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, eps);
|
|
}
|
|
|
|
test_norm(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {64, 5, 4, 3},
|
|
float eps = 1e-6f)
|
|
: type(type), ne(ne), eps(eps) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_norm(ctx, a, eps);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_RMS_NORM
|
|
struct test_rms_norm : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
float eps;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, eps);
|
|
}
|
|
|
|
test_rms_norm(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {64, 5, 4, 3},
|
|
float eps = 1e-6f)
|
|
: type(type), ne(ne), eps(eps) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SSM_CONV
|
|
struct test_ssm_conv : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
const std::array<int64_t, 4> ne_b;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne_a, ne_b);
|
|
}
|
|
|
|
test_ssm_conv(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
|
|
std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
|
|
: type(type), ne_a(ne_a), ne_b(ne_b) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SSM_SCAN
|
|
struct test_ssm_scan : public test_case {
|
|
const ggml_type type;
|
|
|
|
const int64_t d_state;
|
|
const int64_t d_inner;
|
|
const int64_t n_seq_tokens;
|
|
const int64_t n_seqs;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
|
|
}
|
|
|
|
test_ssm_scan(ggml_type type = GGML_TYPE_F32,
|
|
int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
|
|
: type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, n_seqs, 1 }.data());
|
|
ggml_tensor * x = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
|
|
ggml_tensor * dt = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
|
|
ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, 1 , 1 }.data());
|
|
ggml_tensor * B = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
|
|
ggml_tensor * C = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
|
|
ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_RWKV_WKV6
|
|
struct test_rwkv_wkv6 : public test_case {
|
|
const ggml_type type;
|
|
|
|
const int64_t head_count;
|
|
const int64_t head_size;
|
|
const int64_t n_seq_tokens;
|
|
const int64_t n_seqs;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
|
|
}
|
|
|
|
test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32,
|
|
int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
|
|
: type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
const int64_t n_tokens = n_seq_tokens * n_seqs;
|
|
ggml_tensor * r = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
|
ggml_tensor * k = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ head_size, 1, head_count, n_tokens }.data());
|
|
ggml_tensor * v = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
|
ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
|
|
ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
|
ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
|
|
ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_MUL_MAT
|
|
struct test_mul_mat : public test_case {
|
|
const ggml_type type_a;
|
|
const ggml_type type_b;
|
|
const int64_t m;
|
|
const int64_t n;
|
|
const int64_t k;
|
|
const std::array<int64_t, 2> bs; // dims 3 and 4
|
|
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
|
|
const std::array<int64_t, 4> per; // permutation of dimensions
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 5e-4;
|
|
}
|
|
|
|
uint64_t op_flops(ggml_tensor * t) override {
|
|
GGML_UNUSED(t);
|
|
return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
|
|
}
|
|
|
|
test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
|
std::array<int64_t, 2> bs = {10, 10},
|
|
std::array<int64_t, 2> nr = {2, 2},
|
|
std::array<int64_t, 4> per = {0, 1, 2, 3})
|
|
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
|
ggml_tensor * a;
|
|
ggml_tensor * b;
|
|
|
|
const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
|
|
if (npermuted > 0) {
|
|
GGML_ASSERT(npermuted == 2);
|
|
GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
|
|
GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
|
|
|
|
// Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k.
|
|
const int64_t ne_a[4] = {k, m, bs[0], bs[1]};
|
|
const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]};
|
|
|
|
a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
|
|
b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_param(ctx, b);
|
|
ggml_set_name(a, "a");
|
|
ggml_set_name(b, "b");
|
|
|
|
a = ggml_permute(ctx, a, per[0], per[1], per[2], per[3]);
|
|
b = ggml_permute(ctx, b, per[0], per[1], per[2], per[3]);
|
|
ggml_set_name(a, "a_permuted");
|
|
ggml_set_name(b, "b_permuted");
|
|
} else {
|
|
a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
|
|
b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_param(ctx, b);
|
|
ggml_set_name(a, "a");
|
|
ggml_set_name(b, "b");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_mul_mat(ctx, a, b);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_MUL_MAT_ID
|
|
struct test_mul_mat_id : public test_case {
|
|
const ggml_type type_a;
|
|
const ggml_type type_b;
|
|
const int n_mats;
|
|
const int n_used;
|
|
const bool b; // brodcast b matrix
|
|
const int64_t m;
|
|
const int64_t n;
|
|
const int64_t k;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 5e-4;
|
|
}
|
|
|
|
uint64_t op_flops(ggml_tensor * t) override {
|
|
GGML_UNUSED(t);
|
|
return 2 * m * k * n * n_used;
|
|
}
|
|
|
|
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
|
int n_mats = 8, int n_used = 2, bool b = false,
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32)
|
|
: type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
|
|
m(m), n(n), k(k) {
|
|
GGML_ASSERT(n_used <= n_mats);
|
|
}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
|
ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
|
|
ggml_set_name(as, "as");
|
|
|
|
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
|
|
ggml_set_name(ids, "ids");
|
|
if (n_used != n_mats) {
|
|
ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
|
|
ggml_set_name(ids, "view_of_ids");
|
|
}
|
|
|
|
ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
|
|
ggml_set_name(b, "b");
|
|
|
|
ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
std::random_device rd;
|
|
std::default_random_engine rng(rd());
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (t->type == GGML_TYPE_I32) {
|
|
if (ggml_is_view_op(t->op)) { continue; }
|
|
// ids
|
|
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
|
std::vector<int32_t> data(t->ne[0]);
|
|
for (int i = 0; i < t->ne[0]; i++) {
|
|
data[i] = i % n_mats;
|
|
}
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
|
|
}
|
|
} else {
|
|
init_tensor_uniform(t);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// GGML_OP_OUT_PROD
|
|
struct test_out_prod : public test_case {
|
|
const ggml_type type_a;
|
|
const ggml_type type_b;
|
|
const int64_t m;
|
|
const int64_t n;
|
|
const int64_t k;
|
|
const std::array<int64_t, 2> bs; // dims 3 and 4
|
|
const bool trans_b;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b);
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 5e-4;
|
|
}
|
|
|
|
test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
|
std::array<int64_t, 2> bs = {10, 10},
|
|
bool trans_b = false)
|
|
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * b;
|
|
if (trans_b) {
|
|
b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]);
|
|
b = ggml_transpose(ctx, b);
|
|
} else {
|
|
b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]);
|
|
}
|
|
ggml_set_name(b, "b");
|
|
|
|
ggml_tensor * out = ggml_out_prod(ctx, a, b);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SQR
|
|
struct test_sqr : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_sqr(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_sqr(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum.
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SQRT
|
|
struct test_sqrt : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_sqrt(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 3, 3, 2})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_sqrt(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
// fill with positive values
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
init_tensor_uniform(t, 50.0f, 100.0f);
|
|
}
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 20.0f;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_LOG
|
|
struct test_log : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_log(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_log(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
// log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass:
|
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
|
}
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SIN
|
|
struct test_sin : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_sin(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 2, 2, 2})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_sin(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
|
|
}
|
|
}
|
|
|
|
double max_maa_err() override {
|
|
return 1e-3;
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 0.2f;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_COS
|
|
struct test_cos : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_cos(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 2, 2, 2})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_cos(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
|
|
}
|
|
}
|
|
|
|
double max_maa_err() override {
|
|
return 1e-3;
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 0.2f;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_CLAMP
|
|
struct test_clamp : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
float min;
|
|
float max;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR4(type, ne, min, max);
|
|
}
|
|
|
|
test_clamp(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
|
float min = -0.5f, float max = 0.5f)
|
|
: type(type), ne(ne), min(min), max(max) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_clamp(ctx, a, min, max);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 1e-2f;
|
|
}
|
|
|
|
std::vector<float> grad_expect() override {
|
|
return {0.0f, 1.0f};
|
|
}
|
|
};
|
|
|
|
// GGML_OP_DIAG_MASK_INF
|
|
struct test_diag_mask_inf : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const int n_past;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, n_past);
|
|
}
|
|
|
|
test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 10, 3, 2},
|
|
int n_past = 5)
|
|
: type(type), ne(ne), n_past(n_past) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SOFT_MAX
|
|
struct test_soft_max : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const bool mask;
|
|
const float scale;
|
|
const float max_bias;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR5(type, ne, mask, scale, max_bias);
|
|
}
|
|
|
|
// the 1024 test with bias occasionally fails:
|
|
// SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
|
|
virtual double max_nmse_err() override {
|
|
return 1e-6;
|
|
}
|
|
|
|
test_soft_max(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
|
bool mask = false,
|
|
float scale = 1.0f,
|
|
float max_bias = 0.0f)
|
|
: type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * mask = nullptr;
|
|
if (this->mask) {
|
|
mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
|
|
ggml_set_name(mask, "mask");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
|
|
// GGML_OP_ROPE
|
|
struct test_rope : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
int n_dims;
|
|
int mode;
|
|
int n_ctx; // used to generate positions
|
|
float fs; // freq_scale
|
|
float ef; // ext_factor
|
|
float af; // attn_factor
|
|
bool ff;
|
|
int v; // view (1 : non-contiguous a)
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
|
|
}
|
|
|
|
test_rope(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {10, 5, 3, 1},
|
|
int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
|
|
: type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a;
|
|
if (v & 1) {
|
|
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
ggml_set_name(a, "view_of_a");
|
|
} else {
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
}
|
|
|
|
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
|
|
ggml_set_name(pos, "pos");
|
|
|
|
ggml_tensor * freq = nullptr;
|
|
if (ff) {
|
|
freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2);
|
|
ggml_set_name(freq, "freq");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (t->type == GGML_TYPE_I32) {
|
|
// pos
|
|
std::vector<int> data(ne_a[2]);
|
|
for (int i = 0; i < ne_a[2]; i++) {
|
|
data[i] = rand() % n_ctx;
|
|
}
|
|
ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
|
|
} else {
|
|
if (t->ne[0] == n_dims/2) {
|
|
// frequency factors in the range [0.9f, 1.1f]
|
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
|
} else {
|
|
init_tensor_uniform(t);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
double max_maa_err() override {
|
|
return 1e-3;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_POOL2D
|
|
struct test_pool2d : public test_case {
|
|
enum ggml_op_pool pool_type;
|
|
const ggml_type type_input;
|
|
const std::array<int64_t, 4> ne_input;
|
|
// kernel size
|
|
const int k0;
|
|
const int k1;
|
|
// stride
|
|
const int s0;
|
|
const int s1;
|
|
// padding
|
|
const int p0;
|
|
const int p1;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
|
|
}
|
|
|
|
test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
|
|
ggml_type type_input = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
|
int k0 = 3, int k1 = 3,
|
|
int s0 = 1, int s1 = 1,
|
|
int p0 = 1, int p1 = 1)
|
|
: pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
|
ggml_set_param(ctx, input);
|
|
ggml_set_name(input, "input");
|
|
|
|
ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_CONV_TRANSPOSE_1D
|
|
struct test_conv_transpose_1d : public test_case {
|
|
const std::array<int64_t, 4> ne_input;
|
|
const std::array<int64_t, 4> ne_kernel;
|
|
|
|
const int s0; // stride
|
|
const int p0; // padding
|
|
const int d0; // dilation
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
|
|
}
|
|
|
|
test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
|
|
std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
|
int s0 = 1, int p0 = 0, int d0 = 1)
|
|
: ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
|
ggml_set_name(input, "input");
|
|
|
|
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
|
ggml_set_name(kernel, "kernel");
|
|
|
|
ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_IM2COL
|
|
struct test_im2col : public test_case {
|
|
const ggml_type type_input;
|
|
const ggml_type type_kernel;
|
|
const ggml_type dst_type;
|
|
const std::array<int64_t, 4> ne_input;
|
|
const std::array<int64_t, 4> ne_kernel;
|
|
// stride
|
|
const int s0;
|
|
const int s1;
|
|
// padding
|
|
const int p0;
|
|
const int p1;
|
|
// dilation
|
|
const int d0;
|
|
const int d1;
|
|
// mode
|
|
const bool is_2D;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
|
|
}
|
|
|
|
test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
|
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
|
int s0 = 1, int s1 = 1,
|
|
int p0 = 1, int p1 = 1,
|
|
int d0 = 1, int d1 = 1,
|
|
bool is_2D = true)
|
|
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
|
ggml_set_param(ctx, input);
|
|
ggml_set_name(input, "input");
|
|
|
|
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
|
ggml_set_name(kernel, "kernel");
|
|
|
|
ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_CONCAT
|
|
struct test_concat : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
const int64_t ne_b_d;
|
|
const int dim;
|
|
const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
|
|
}
|
|
|
|
test_concat(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {10, 5, 5, 5},
|
|
int64_t ne_b_d = 5,
|
|
int dim = 2, int v = 0)
|
|
: type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
auto ne_b = ne_a;
|
|
ne_b[dim] = ne_b_d;
|
|
ggml_tensor * a;
|
|
if (v & 1) {
|
|
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
ggml_set_name(a, "view_of_a");
|
|
} else {
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
ggml_set_name(a, "a");
|
|
}
|
|
ggml_tensor * b;
|
|
if (v & 2) {
|
|
auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
|
|
b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(b, "b");
|
|
|
|
b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
|
|
ggml_set_name(b, "view_of_b");
|
|
} else {
|
|
b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
ggml_set_name(b, "b");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_concat(ctx, a, b, dim);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_ARGSORT
|
|
struct test_argsort : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
ggml_sort_order order;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, order);
|
|
}
|
|
|
|
test_argsort(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {16, 10, 10, 10},
|
|
ggml_sort_order order = GGML_SORT_ORDER_ASC)
|
|
: type(type), ne(ne), order(order) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_argsort(ctx, a, order);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
std::random_device rd;
|
|
std::default_random_engine rng(rd());
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (t->type == GGML_TYPE_I32) {
|
|
// indices
|
|
std::vector<int> data(ggml_nelements(t));
|
|
for (int i = 0; i < ggml_nelements(t); i++) {
|
|
data[i] = rand();
|
|
}
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
|
|
} else if (t->type == GGML_TYPE_F32) {
|
|
// initialize with unique values to avoid ties
|
|
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
|
std::vector<float> data(t->ne[0]);
|
|
for (int i = 0; i < t->ne[0]; i++) {
|
|
data[i] = i;
|
|
}
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
|
|
}
|
|
} else {
|
|
GGML_ABORT("fatal error");
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SUM
|
|
struct test_sum : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_sum(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_sum(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]);
|
|
}
|
|
};
|
|
|
|
// GGML_OP_SUM_ROWS
|
|
struct test_sum_rows : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_sum_rows(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_sum_rows(ctx, a);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_UPSCALE
|
|
struct test_upscale : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const int32_t scale_factor;
|
|
const bool transpose;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR4(type, ne, scale_factor, transpose);
|
|
}
|
|
|
|
test_upscale(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
|
int32_t scale_factor = 2, bool transpose = false)
|
|
: type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
if (transpose) {
|
|
a = ggml_transpose(ctx, a);
|
|
ggml_set_name(a, "a_transposed");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_UPSCALE (ext)
|
|
struct test_upscale_ext : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const std::array<int64_t, 4> ne_tgt;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, ne_tgt);
|
|
}
|
|
|
|
test_upscale_ext(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {2, 5, 7, 11},
|
|
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
|
|
: type(type), ne(ne), ne_tgt(ne_tgt) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_GROUP_NORM
|
|
struct test_group_norm : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const int32_t num_groups;
|
|
const float eps;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne, num_groups);
|
|
}
|
|
|
|
test_group_norm(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {64, 64, 320, 1},
|
|
int32_t num_groups = 32,
|
|
float eps = 1e-6f)
|
|
: type(type), ne(ne), num_groups(num_groups), eps(eps) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_ACC
|
|
struct test_acc : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
const std::array<int64_t, 4> ne_b;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne_a, ne_b);
|
|
}
|
|
|
|
test_acc(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {256, 17, 1, 1},
|
|
std::array<int64_t, 4> ne_b = {256, 16, 1, 1})
|
|
: type(type), ne_a(ne_a), ne_b(ne_b) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
ggml_set_param(ctx, a);
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
ggml_set_param(ctx, b);
|
|
ggml_set_name(b, "b");
|
|
|
|
ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_PAD
|
|
struct test_pad : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
const int pad_0;
|
|
const int pad_1;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
|
|
}
|
|
|
|
test_pad(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
|
|
int pad_0 = 1, int pad_1 = 1)
|
|
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_ARANGE
|
|
struct test_arange : public test_case {
|
|
const ggml_type type;
|
|
const float start;
|
|
const float stop;
|
|
const float step;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR4(type, start, stop, step);
|
|
}
|
|
|
|
test_arange(ggml_type type = GGML_TYPE_F32,
|
|
float start = 0.f, float stop = 10.f, float step = 1.f)
|
|
: type(type), start(start), stop(stop), step(step) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * out = ggml_arange(ctx, start, stop, step);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_TIMESTEP_EMBEDDING
|
|
struct test_timestep_embedding : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
const int dim;
|
|
const int max_period;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR4(type, ne_a, dim, max_period);
|
|
}
|
|
|
|
test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
|
|
int dim = 320, int max_period=10000)
|
|
: type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_LEAKY_RELU
|
|
struct test_leaky_relu : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne_a;
|
|
const float negative_slope;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR3(type, ne_a, negative_slope);
|
|
}
|
|
|
|
test_leaky_relu(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne_a = {10, 5, 4, 3},
|
|
float negative_slope = 0.1f)
|
|
: type(type), ne_a(ne_a), negative_slope(negative_slope) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_FLASH_ATTN_EXT
|
|
struct test_flash_attn_ext : public test_case {
|
|
const int64_t hs; // head size
|
|
const int64_t nh; // num heads
|
|
const int64_t kv; // kv size
|
|
const int64_t nb; // batch size
|
|
|
|
const bool mask; // use mask
|
|
|
|
const float max_bias; // ALiBi
|
|
const float logit_softcap; // Gemma 2
|
|
|
|
const ggml_type type_KV;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 5e-4;
|
|
}
|
|
|
|
uint64_t op_flops(ggml_tensor * t) override {
|
|
GGML_UNUSED(t);
|
|
// Just counting matmul costs:
|
|
// Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
|
|
return 2 * 2 * nh * nb * hs * kv;
|
|
}
|
|
|
|
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
|
|
bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
|
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
|
|
|
|
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
|
|
ggml_set_name(q, "q");
|
|
|
|
ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
|
ggml_set_name(k, "k");
|
|
|
|
ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
|
ggml_set_name(v, "v");
|
|
|
|
ggml_tensor * m = nullptr;
|
|
if (mask) {
|
|
m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
|
|
ggml_set_name(m, "m");
|
|
}
|
|
|
|
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_CROSS_ENTROPY_LOSS
|
|
struct test_cross_entropy_loss : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR2(type, ne);
|
|
}
|
|
|
|
test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
: type(type), ne(ne) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
ggml_set_param(ctx, logits);
|
|
ggml_set_name(logits, "logits");
|
|
|
|
ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
// The labels are assumed to be constant -> no gradients.
|
|
ggml_set_name(labels, "labels");
|
|
|
|
// Ensure labels add up to 1:
|
|
labels = ggml_soft_max(ctx, labels);
|
|
ggml_set_name(labels, "labels_normalized");
|
|
|
|
ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
// For larger abs. diffs between logits softmax is more linear, therefore more precise num. gradients.
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
init_tensor_uniform(t, -100.0f, 100.0f);
|
|
}
|
|
}
|
|
|
|
float grad_eps() override {
|
|
return 1.0f;
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// GGML_OP_OPT_STEP_ADAMW
|
|
struct test_opt_step_adamw : public test_case {
|
|
const ggml_type type;
|
|
const std::array<int64_t, 4> ne;
|
|
const float alpha;
|
|
const float beta1;
|
|
const float beta2;
|
|
const float eps;
|
|
const float wd;
|
|
|
|
std::string vars() override {
|
|
return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
|
|
}
|
|
|
|
test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
|
float alpha = 1e-3f,
|
|
float beta1 = 0.9f,
|
|
float beta2 = 0.999f,
|
|
float eps = 1e-8f,
|
|
float wd = 0.0f)
|
|
: type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
|
ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
|
ggml_set_name(grad, "grad");
|
|
|
|
ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
|
|
ggml_set_name(out, "out");
|
|
|
|
return out;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
|
|
}
|
|
}
|
|
|
|
bool grad_precise() override {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
enum llm_norm_type {
|
|
LLM_NORM,
|
|
LLM_NORM_RMS,
|
|
};
|
|
|
|
struct llama_hparams {
|
|
uint32_t n_vocab;
|
|
uint32_t n_embd;
|
|
uint32_t n_head;
|
|
uint32_t n_head_kv;
|
|
static constexpr uint32_t n_layer = 1;
|
|
uint32_t n_rot;
|
|
uint32_t n_embd_head; // dimension of values (d_v)
|
|
uint32_t n_ff;
|
|
|
|
float f_norm_eps;
|
|
float f_norm_rms_eps;
|
|
|
|
// cparams
|
|
static constexpr uint32_t n_ctx = 512; // user-specified context size
|
|
static constexpr uint32_t n_ctx_orig = n_ctx;
|
|
|
|
// batch
|
|
int32_t n_tokens;
|
|
|
|
// llm_build_context
|
|
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
|
|
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
|
|
|
|
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
|
|
return n_embd_head * n_head_kv;
|
|
}
|
|
};
|
|
|
|
// LLM base class
|
|
struct test_llm : public test_case {
|
|
llama_hparams hp;
|
|
|
|
protected:
|
|
test_llm(llama_hparams hp)
|
|
: hp(std::move(hp)) {
|
|
}
|
|
|
|
public:
|
|
struct ggml_tensor * llm_build_norm(
|
|
struct ggml_context * ctx,
|
|
struct ggml_tensor * cur,
|
|
struct ggml_tensor * mw,
|
|
struct ggml_tensor * mb,
|
|
llm_norm_type type) {
|
|
switch (type) {
|
|
case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
|
|
case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
|
|
}
|
|
cur = ggml_mul(ctx, cur, mw);
|
|
if (mb) {
|
|
cur = ggml_add(ctx, cur, mb);
|
|
}
|
|
return cur;
|
|
}
|
|
|
|
void llm_build_kv_store(
|
|
struct ggml_context * ctx,
|
|
struct ggml_tensor * k_l,
|
|
struct ggml_tensor * v_l,
|
|
struct ggml_tensor * k_cur,
|
|
struct ggml_tensor * v_cur) {
|
|
// compute the transposed [n_tokens, n_embd] V matrix
|
|
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
|
|
|
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
|
|
(ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
|
|
|
|
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
|
|
( hp.n_ctx)*ggml_element_size(v_l),
|
|
(hp.kv_head)*ggml_element_size(v_l));
|
|
|
|
// important: storing RoPE-ed version of K in the KV cache!
|
|
ggml_cpy(ctx, k_cur, k_cache_view);
|
|
ggml_cpy(ctx, v_cur_t, v_cache_view);
|
|
}
|
|
|
|
struct ggml_tensor * llm_build_kqv(
|
|
struct ggml_context * ctx,
|
|
struct ggml_tensor * k_l,
|
|
struct ggml_tensor * v_l,
|
|
struct ggml_tensor * q_cur,
|
|
struct ggml_tensor * kq_mask,
|
|
float kq_scale) {
|
|
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
|
|
|
struct ggml_tensor * k =
|
|
ggml_view_3d(ctx, k_l,
|
|
hp.n_embd_head, hp.n_kv, hp.n_head_kv,
|
|
ggml_row_size(k_l->type, hp.n_embd_gqa()),
|
|
ggml_row_size(k_l->type, hp.n_embd_head),
|
|
0);
|
|
|
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
|
|
|
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
|
|
|
|
// split cached v into n_head heads
|
|
struct ggml_tensor * v =
|
|
ggml_view_3d(ctx, v_l,
|
|
hp.n_kv, hp.n_embd_head, hp.n_head_kv,
|
|
ggml_element_size(v_l)*hp.n_ctx,
|
|
ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
|
|
0);
|
|
|
|
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
|
|
|
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
|
|
|
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
|
|
|
|
struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
|
cur = ggml_mul_mat(ctx, wo, cur);
|
|
|
|
return cur;
|
|
}
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
if (t->type == GGML_TYPE_I32) {
|
|
// pos
|
|
std::vector<int> data(hp.n_tokens);
|
|
for (int i = 0; i < hp.n_tokens; i++) {
|
|
data[i] = rand() % hp.n_ctx;
|
|
}
|
|
ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
|
|
} else {
|
|
init_tensor_uniform(t);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// Llama
|
|
struct test_llama : public test_llm {
|
|
static constexpr float freq_base = 10000.0f;
|
|
static constexpr float freq_scale = 1.0f;
|
|
static constexpr float ext_factor = 0.0f;
|
|
static constexpr float attn_factor = 1.0f;
|
|
static constexpr float beta_fast = 32.0f;
|
|
static constexpr float beta_slow = 1.0f;
|
|
|
|
std::string op_desc(ggml_tensor * t) override {
|
|
GGML_UNUSED(t);
|
|
return "LLAMA";
|
|
}
|
|
|
|
std::string vars() override {
|
|
auto n_tokens = hp.n_tokens;
|
|
return VARS_TO_STR1(n_tokens);
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 2e-3;
|
|
}
|
|
|
|
test_llama(int n_tokens = 1)
|
|
: test_llm({
|
|
/*n_vocab =*/ 32000,
|
|
/*n_embd =*/ 3200,
|
|
/*n_head =*/ 32,
|
|
/*n_head_kv =*/ 32,
|
|
/*n_rot =*/ 100,
|
|
/*n_embd_head =*/ 100,
|
|
/*n_ff =*/ 8640,
|
|
/*f_norm_eps =*/ 0.f,
|
|
/*f_norm_rms_eps =*/ 1e-5f,
|
|
/*n_tokens =*/ n_tokens,
|
|
}) {
|
|
}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
struct ggml_tensor * cur;
|
|
struct ggml_tensor * inpL;
|
|
|
|
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
|
|
|
// inp_pos - contains the positions
|
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
|
|
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
|
|
|
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
|
struct ggml_tensor * inpSA = inpL;
|
|
|
|
// norm
|
|
ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
// self-attention
|
|
{
|
|
ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
|
ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
|
ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
|
|
|
// compute Q and K and RoPE them
|
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
|
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
|
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
|
|
|
|
Qcur = ggml_rope_ext(
|
|
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
|
|
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
);
|
|
|
|
Kcur = ggml_rope_ext(
|
|
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
|
|
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
);
|
|
|
|
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
|
|
|
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
|
}
|
|
|
|
struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
|
|
|
|
// feed-forward network
|
|
ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
|
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
|
|
cur = ggml_mul_mat(ctx, ffn_gate, cur);
|
|
cur = ggml_silu(ctx, cur);
|
|
cur = ggml_mul(ctx, cur, tmp);
|
|
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
|
|
|
cur = ggml_add(ctx, cur, ffn_inp);
|
|
|
|
// input for next layer
|
|
inpL = cur;
|
|
}
|
|
|
|
cur = inpL;
|
|
|
|
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
// lm_head
|
|
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
|
|
cur = ggml_mul_mat(ctx, output, cur);
|
|
|
|
return cur;
|
|
}
|
|
};
|
|
|
|
// Falcon
|
|
struct test_falcon : public test_llm {
|
|
static constexpr float freq_base = 10000.0f;
|
|
static constexpr float freq_scale = 1.0f;
|
|
static constexpr float ext_factor = 0.0f;
|
|
static constexpr float attn_factor = 1.0f;
|
|
static constexpr float beta_fast = 32.0f;
|
|
static constexpr float beta_slow = 1.0f;
|
|
|
|
std::string op_desc(ggml_tensor * t) override {
|
|
GGML_UNUSED(t);
|
|
return "FALCON";
|
|
}
|
|
|
|
std::string vars() override {
|
|
auto n_tokens = hp.n_tokens;
|
|
return VARS_TO_STR1(n_tokens);
|
|
}
|
|
|
|
double max_nmse_err() override {
|
|
return 2e-3;
|
|
}
|
|
|
|
test_falcon(int n_tokens = 1)
|
|
: test_llm({
|
|
/*n_vocab =*/ 32000,
|
|
/*n_embd =*/ 3200,
|
|
/*n_head =*/ 50,
|
|
/*n_head_kv =*/ 1,
|
|
/*n_rot =*/ 64,
|
|
/*n_embd_head =*/ 64,
|
|
/*n_ff =*/ 8640,
|
|
/*f_norm_eps =*/ 1e-5f,
|
|
/*f_norm_rms_eps =*/ 0.f,
|
|
/*n_tokens =*/ n_tokens,
|
|
}) {
|
|
}
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
struct ggml_tensor * cur;
|
|
struct ggml_tensor * inpL;
|
|
|
|
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
|
|
|
// inp_pos - contains the positions
|
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
|
|
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
|
|
|
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
|
// norm
|
|
ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
|
|
|
|
// self-attention
|
|
{
|
|
cur = attn_norm;
|
|
|
|
ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
|
|
|
|
cur = ggml_mul_mat(ctx, wqkv, cur);
|
|
|
|
struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
|
|
struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
|
|
struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
|
|
|
|
Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
|
|
Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
|
|
|
|
// using mode = 2 for neox mode
|
|
Qcur = ggml_rope_ext(
|
|
ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
);
|
|
|
|
Kcur = ggml_rope_ext(
|
|
ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
);
|
|
|
|
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
|
|
|
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
|
}
|
|
|
|
struct ggml_tensor * ffn_inp = cur;
|
|
|
|
// feed forward
|
|
{
|
|
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
|
cur = attn_norm;
|
|
cur = ggml_mul_mat(ctx, ffn_up, cur);
|
|
cur = ggml_gelu(ctx, cur);
|
|
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
|
}
|
|
|
|
cur = ggml_add(ctx, cur, ffn_inp);
|
|
|
|
cur = ggml_add(ctx, cur, inpL);
|
|
|
|
// input for next layer
|
|
inpL = cur;
|
|
}
|
|
|
|
cur = inpL;
|
|
|
|
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
|
|
|
|
// lm_head
|
|
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
|
|
cur = ggml_mul_mat(ctx, output, cur);
|
|
|
|
return cur;
|
|
}
|
|
};
|
|
|
|
|
|
// ###########################################
|
|
// ## Section 3: GGML Op Test Instantiation ##
|
|
// ###########################################
|
|
static const ggml_type all_types[] = {
|
|
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
|
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
|
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
|
GGML_TYPE_Q8_0,
|
|
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
|
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
|
GGML_TYPE_Q6_K,
|
|
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
|
|
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
|
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
|
|
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
|
};
|
|
|
|
static const ggml_type base_types[] = {
|
|
GGML_TYPE_F32, GGML_TYPE_F16,
|
|
GGML_TYPE_Q4_0,
|
|
GGML_TYPE_Q4_K,
|
|
GGML_TYPE_IQ2_XXS
|
|
};
|
|
|
|
static const ggml_type other_types[] = {
|
|
GGML_TYPE_Q4_1,
|
|
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
|
GGML_TYPE_Q8_0,
|
|
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
|
GGML_TYPE_Q5_K,
|
|
GGML_TYPE_Q6_K,
|
|
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
|
|
GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
|
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
|
|
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
|
GGML_TYPE_BF16,
|
|
};
|
|
|
|
// Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
|
|
static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
std::vector<std::unique_ptr<test_case>> test_cases;
|
|
std::default_random_engine rng(0);
|
|
|
|
// unary ops
|
|
for (int v : {0, 1}) {
|
|
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
|
|
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 2, 2, 2 }, v));
|
|
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 5, 7, 11, 13 }, v));
|
|
}
|
|
}
|
|
|
|
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
|
|
for (ggml_type type : all_types) {
|
|
for (int b : {1, 7}) {
|
|
for (bool v : {false, true}) {
|
|
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
|
|
}
|
|
}
|
|
}
|
|
for (int b : {1, 7}) {
|
|
for (bool v : {false, true}) {
|
|
test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
|
|
}
|
|
}
|
|
|
|
for (ggml_type type_input : {GGML_TYPE_F32}) {
|
|
for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
|
|
for (int k0 : {1, 3}) {
|
|
for (int k1 : {1, 3}) {
|
|
for (int s0 : {1, 2}) {
|
|
for (int s1 : {1, 2}) {
|
|
for (int p0 : {0, 1}) {
|
|
for (int p1 : {0, 1}) {
|
|
test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// im2col 1D
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
|
for (int s0 : {1, 3}) {
|
|
for (int p0 : {0, 3}) {
|
|
for (int d0 : {1, 3}) {
|
|
test_cases.emplace_back(new test_im2col(
|
|
GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1},
|
|
s0, 0, p0, 0, d0, 0, false));
|
|
}
|
|
}
|
|
}
|
|
|
|
// im2col 2D
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
|
for (int s0 : {1, 3}) {
|
|
for (int s1 : {1, 3}) {
|
|
for (int p0 : {0, 3}) {
|
|
for (int p1 : {0, 3}) {
|
|
for (int d0 : {1, 3}) {
|
|
for (int d1 : {1, 3}) {
|
|
test_cases.emplace_back(new test_im2col(
|
|
GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2},
|
|
s0, s1, p0, p1, d0, d1, true));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// extra tests for im2col 2D
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
|
|
|
|
// sycl backend will limit task global_range < MAX_INT
|
|
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
|
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
|
// these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
|
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
|
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d());
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
|
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_argmax());
|
|
test_cases.emplace_back(new test_count_equal());
|
|
|
|
for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
|
|
}
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
|
|
|
for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
|
|
test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
|
|
}
|
|
|
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
for (ggml_type type_dst : all_types) {
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
|
}
|
|
}
|
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
|
|
}
|
|
}
|
|
|
|
test_cases.emplace_back(new test_cont());
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
|
|
|
|
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
|
|
for (auto op : {ggml_add, ggml_mul, ggml_div}) {
|
|
test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
|
|
}
|
|
};
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 1, 1}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 1}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 2, 2, 2});
|
|
|
|
// stable diffusion
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
|
|
add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
|
|
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
|
|
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
|
|
|
|
test_cases.emplace_back(new test_add1());
|
|
test_cases.emplace_back(new test_scale());
|
|
|
|
for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
|
|
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
|
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
|
}
|
|
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
|
|
|
|
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
|
|
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
|
|
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
|
|
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
|
|
|
|
#if 1
|
|
for (ggml_type type_a : base_types) {
|
|
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
// test cases without permutation
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
|
|
|
|
// test cases with permutation
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
|
|
}
|
|
}
|
|
for (ggml_type type_a : other_types) {
|
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
if (ggml_blck_size(type_a) != 256) {
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
|
|
}
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
|
|
}
|
|
}
|
|
#else
|
|
// m = a rows
|
|
// n = b rows
|
|
// k = cols
|
|
std::uniform_int_distribution<> dist_m(1, 128);
|
|
std::uniform_int_distribution<> dist_n(16, 128);
|
|
std::uniform_int_distribution<> dist_k(1, 16);
|
|
for (int i = 0; i < 1000; i++) {
|
|
for (ggml_type type_a : all_types) {
|
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
int m = dist_m(rng);
|
|
int n = dist_n(rng);
|
|
int k = dist_k(rng) * ggml_blck_size(type_a);
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
|
|
|
|
// sycl backend will limit task global_range < MAX_INT
|
|
// test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
|
|
// however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
|
// this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
|
// test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
|
|
|
|
for (ggml_type type_a : base_types) {
|
|
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
|
for (int n_mats : {4, 8}) {
|
|
for (int n_used : {1, 2, 4}) {
|
|
for (bool b : {false, true}) {
|
|
for (int n : {1, 32}) {
|
|
int m = 512;
|
|
int k = 256;
|
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (ggml_type type_a : other_types) {
|
|
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
|
for (int n_mats : {4}) {
|
|
for (int n_used : {2}) {
|
|
for (bool b : {false}) {
|
|
for (int n : {1, 32}) {
|
|
int m = 512;
|
|
int k = 256;
|
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (ggml_type type_a : base_types) {
|
|
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1, 1}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}, true));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
}
|
|
}
|
|
|
|
test_cases.emplace_back(new test_sqr());
|
|
test_cases.emplace_back(new test_sqrt());
|
|
test_cases.emplace_back(new test_log());
|
|
test_cases.emplace_back(new test_sin());
|
|
test_cases.emplace_back(new test_cos());
|
|
test_cases.emplace_back(new test_clamp());
|
|
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5));
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5));
|
|
|
|
#if 0
|
|
std::uniform_int_distribution<> dist_ne1(1, 50);
|
|
int exponent = 1;
|
|
while (exponent < (1 << 17)) {
|
|
std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
|
|
|
|
for (int n = 0; n < 10; ++n) {
|
|
int64_t ne0 = dist_ne0(rng);
|
|
int64_t ne1 = dist_ne1(rng);
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
|
|
}
|
|
|
|
exponent <<= 1;
|
|
}
|
|
#endif
|
|
for (bool mask : {false, true}) {
|
|
for (float max_bias : {0.0f, 8.0f}) {
|
|
if (!mask && max_bias > 0.0f) continue;
|
|
for (float scale : {1.0f, 0.1f}) {
|
|
for (int64_t ne0 : {16, 1024}) {
|
|
for (int64_t ne1 : {16, 1024}) {
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
|
|
|
|
{
|
|
bool all = true;
|
|
|
|
for (float v : { 0, 1 }) {
|
|
for (float fs : { 1.0f, 1.4245f }) {
|
|
for (float ef : { 0.0f, 0.7465f }) {
|
|
for (float af : { 1.0f, 1.4245f }) {
|
|
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
for (bool ff : {false, true}) { // freq_factors
|
|
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
|
|
|
|
if (all) {
|
|
test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
|
|
test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
|
|
test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
|
|
}
|
|
|
|
if (all) {
|
|
test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
|
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
|
|
}
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
|
}
|
|
}
|
|
|
|
all = false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int v : { 0, 1, 2, 3 }) {
|
|
for (int dim : { 0, 1, 2, 3, }) {
|
|
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
|
|
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
|
|
}
|
|
}
|
|
|
|
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
|
|
}
|
|
|
|
test_cases.emplace_back(new test_sum());
|
|
test_cases.emplace_back(new test_sum_rows());
|
|
test_cases.emplace_back(new test_upscale());
|
|
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
|
|
test_cases.emplace_back(new test_upscale_ext());
|
|
test_cases.emplace_back(new test_group_norm());
|
|
test_cases.emplace_back(new test_acc());
|
|
test_cases.emplace_back(new test_pad());
|
|
test_cases.emplace_back(new test_arange());
|
|
test_cases.emplace_back(new test_timestep_embedding());
|
|
test_cases.emplace_back(new test_leaky_relu());
|
|
|
|
for (int hs : { 64, 80, 128, 256, }) {
|
|
for (bool mask : { true, false } ) {
|
|
for (float max_bias : { 0.0f, 8.0f }) {
|
|
if (!mask && max_bias > 0.0f) continue;
|
|
for (float logit_softcap : {0.0f, 10.0f}) {
|
|
if (hs != 128 && logit_softcap != 0.0f) continue;
|
|
for (int nh : { 32, }) {
|
|
for (int kv : { 512, 1024, }) {
|
|
for (int nb : { 1, 3, 32, 35, }) {
|
|
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
|
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
test_cases.emplace_back(new test_cross_entropy_loss());
|
|
for (float wd : {0.0f, 1e-2f}) {
|
|
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
|
|
}
|
|
|
|
// these tests are disabled to save execution time, but they can be handy for debugging
|
|
#if 0
|
|
test_cases.emplace_back(new test_llama(1));
|
|
test_cases.emplace_back(new test_llama(2));
|
|
test_cases.emplace_back(new test_falcon(1));
|
|
test_cases.emplace_back(new test_falcon(2));
|
|
#endif
|
|
|
|
return test_cases;
|
|
}
|
|
|
|
// Test cases for performance evaluation: should be representative of real-world use cases
|
|
static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|
std::vector<std::unique_ptr<test_case>> test_cases;
|
|
|
|
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
|
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
|
|
|
for (int bs : {1, 512}) {
|
|
for (ggml_type type_a : all_types) {
|
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));
|
|
}
|
|
}
|
|
}
|
|
|
|
return test_cases;
|
|
}
|
|
|
|
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
|
if (mode == MODE_TEST) {
|
|
auto test_cases = make_test_cases_eval();
|
|
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
|
|
|
size_t n_ok = 0;
|
|
for (auto & test : test_cases) {
|
|
if (test->eval(backend, backend_cpu, op_name)) {
|
|
n_ok++;
|
|
}
|
|
}
|
|
printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
|
|
|
|
ggml_backend_free(backend_cpu);
|
|
|
|
return n_ok == test_cases.size();
|
|
}
|
|
|
|
if (mode == MODE_GRAD) {
|
|
auto test_cases = make_test_cases_eval();
|
|
size_t n_ok = 0;
|
|
for (auto & test : test_cases) {
|
|
if (test->eval_grad(backend, op_name)) {
|
|
n_ok++;
|
|
}
|
|
}
|
|
printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
|
|
|
|
return n_ok == test_cases.size();
|
|
}
|
|
|
|
if (mode == MODE_PERF) {
|
|
auto test_cases = make_test_cases_perf();
|
|
for (auto & test : test_cases) {
|
|
test->eval_perf(backend, op_name);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
GGML_ABORT("fatal error");
|
|
}
|
|
|
|
static void usage(char ** argv) {
|
|
printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
|
|
printf(" valid modes:\n");
|
|
printf(" - test (default, compare with CPU backend for correctness)\n");
|
|
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
|
|
printf(" - perf (performance evaluation)\n");
|
|
printf(" op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n");
|
|
}
|
|
|
|
int main(int argc, char ** argv) {
|
|
test_mode mode = MODE_TEST;
|
|
const char * op_name_filter = NULL;
|
|
const char * backend_filter = NULL;
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
if (strcmp(argv[i], "test") == 0) {
|
|
mode = MODE_TEST;
|
|
} else if (strcmp(argv[i], "perf") == 0) {
|
|
mode = MODE_PERF;
|
|
} else if (strcmp(argv[i], "grad") == 0) {
|
|
mode = MODE_GRAD;
|
|
} else if (strcmp(argv[i], "-o") == 0) {
|
|
if (i + 1 < argc) {
|
|
op_name_filter = argv[++i];
|
|
} else {
|
|
usage(argv);
|
|
return 1;
|
|
}
|
|
} else if (strcmp(argv[i], "-b") == 0) {
|
|
if (i + 1 < argc) {
|
|
backend_filter = argv[++i];
|
|
} else {
|
|
usage(argv);
|
|
return 1;
|
|
}
|
|
} else {
|
|
usage(argv);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
// enumerate backends
|
|
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
|
|
|
|
size_t n_ok = 0;
|
|
|
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
|
|
printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
|
|
|
|
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
|
|
printf(" Skipping\n");
|
|
n_ok++;
|
|
continue;
|
|
}
|
|
|
|
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
|
GGML_ASSERT(backend != NULL);
|
|
|
|
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
|
|
printf(" Skipping CPU backend\n");
|
|
ggml_backend_free(backend);
|
|
n_ok++;
|
|
continue;
|
|
}
|
|
|
|
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
|
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
|
if (ggml_backend_set_n_threads_fn) {
|
|
// TODO: better value for n_threads
|
|
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
|
|
}
|
|
|
|
printf(" Device description: %s\n", ggml_backend_dev_description(dev));
|
|
size_t free, total; // NOLINT
|
|
ggml_backend_dev_memory(dev, &free, &total);
|
|
printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
|
|
printf("\n");
|
|
|
|
bool ok = test_backend(backend, mode, op_name_filter);
|
|
|
|
printf(" Backend %s: ", ggml_backend_name(backend));
|
|
if (ok) {
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
n_ok++;
|
|
} else {
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
}
|
|
|
|
printf("\n");
|
|
|
|
ggml_backend_free(backend);
|
|
}
|
|
|
|
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
|
|
|
|
if (n_ok != ggml_backend_dev_count()) {
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
return 1;
|
|
}
|
|
|
|
ggml_quantize_free();
|
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
return 0;
|
|
}
|