2024-09-03 17:21:46 +02:00
|
|
|
// This file defines tests for various GGML ops and backends.
|
|
|
|
// For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent.
|
|
|
|
// For the backwards pass it asserts that the gradients from backpropagation are consistent
|
|
|
|
// with the gradients obtained via the method of finite differences ("grad" mode, this is optional).
|
|
|
|
// It is also possible to check the performance ("perf" mode).
|
|
|
|
//
|
|
|
|
// this file has three sections: Section 1 does general setup, section 2 defines the GGML ops to be tested,
|
|
|
|
// and section 3 defines which tests to run.
|
|
|
|
// Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case,
|
|
|
|
// then go to section 3 and add an instantiation of your struct.
|
|
|
|
|
|
|
|
|
|
|
|
// ##############################
|
|
|
|
// ## Section 1: General Setup ##
|
|
|
|
// ##############################
|
|
|
|
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
#include <ggml.h>
|
|
|
|
#include <ggml-alloc.h>
|
|
|
|
#include <ggml-backend.h>
|
2024-05-12 19:40:45 +02:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
#include <algorithm>
|
|
|
|
#include <array>
|
|
|
|
#include <cfloat>
|
2024-09-03 17:21:46 +02:00
|
|
|
#include <cstdint>
|
2023-12-07 21:26:54 +01:00
|
|
|
#include <cstring>
|
2024-09-08 08:57:57 +02:00
|
|
|
#include <cinttypes>
|
2023-12-07 21:26:54 +01:00
|
|
|
#include <functional>
|
|
|
|
#include <memory>
|
|
|
|
#include <random>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string>
|
|
|
|
#include <thread>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
2024-01-17 17:54:56 +01:00
|
|
|
// static RNG initialization (revisit if n_threads stops being constant)
|
|
|
|
static const size_t n_threads = std::thread::hardware_concurrency();
|
|
|
|
static std::vector<std::default_random_engine> generators = []() {
|
|
|
|
std::random_device rd;
|
|
|
|
std::vector<std::default_random_engine> vec;
|
|
|
|
vec.reserve(n_threads);
|
|
|
|
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
|
|
|
|
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
|
|
|
|
return vec;
|
|
|
|
}();
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
size_t size = ggml_nelements(tensor);
|
|
|
|
std::vector<float> data(size);
|
|
|
|
|
2024-01-17 17:54:56 +01:00
|
|
|
auto init_thread = [&](size_t ith, size_t start, size_t end) {
|
2023-12-07 21:26:54 +01:00
|
|
|
std::uniform_real_distribution<float> distribution(min, max);
|
|
|
|
for (size_t i = start; i < end; i++) {
|
2024-01-17 17:54:56 +01:00
|
|
|
data[i] = distribution(generators[ith]);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<std::thread> threads;
|
|
|
|
threads.reserve(n_threads);
|
|
|
|
for (size_t i = 0; i < n_threads; i++) {
|
|
|
|
size_t start = i*size/n_threads;
|
|
|
|
size_t end = (i+1)*size/n_threads;
|
2024-01-17 17:54:56 +01:00
|
|
|
threads.emplace_back(init_thread, i, start, end);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
for (auto & t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
|
2024-05-18 02:39:54 +02:00
|
|
|
#if 0
|
|
|
|
const char * val_str = getenv("GGML_TEST_EPS");
|
|
|
|
float val = 1e-9f;
|
|
|
|
if (val_str != nullptr) {
|
|
|
|
val = std::stof(val_str);
|
|
|
|
printf("GGML_TEST_EPS=%e\n", val);
|
|
|
|
}
|
|
|
|
|
|
|
|
// test quantization with very small values that may result in nan scales due to division by zero
|
|
|
|
if (ggml_is_quantized(tensor->type)) {
|
|
|
|
for (int i = 0; i < 256; i++) {
|
|
|
|
data[i] = val;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
2024-05-08 08:30:09 +02:00
|
|
|
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
2023-12-07 21:26:54 +01:00
|
|
|
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
2023-12-14 20:05:21 +01:00
|
|
|
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
2024-01-17 17:54:56 +01:00
|
|
|
std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
|
|
|
|
const float * im = imatrix.data();
|
|
|
|
if (!ggml_quantize_requires_imatrix(tensor->type)) {
|
|
|
|
// when the imatrix is optional, we want to test both quantization with and without imatrix
|
|
|
|
// use one of the random numbers to decide
|
|
|
|
if (data[0] > 0.5f*(min + max)) {
|
|
|
|
im = nullptr;
|
|
|
|
}
|
|
|
|
}
|
2024-07-19 17:17:27 +02:00
|
|
|
|
2024-03-09 14:53:59 +01:00
|
|
|
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
|
2024-05-18 02:39:54 +02:00
|
|
|
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
|
2024-07-19 17:17:27 +02:00
|
|
|
// TODO: other cases
|
|
|
|
//#pragma omp parallel for
|
|
|
|
//for (int i = 0; i < tensor->ne[1]; i++) {
|
|
|
|
// ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
|
|
|
|
// i * tensor->ne[0], 1, tensor->ne[0], im);
|
|
|
|
//}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
2023-12-29 18:07:03 +01:00
|
|
|
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
|
|
|
// This is going to create some weird integers though.
|
|
|
|
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|
|
|
std::vector<float> tv;
|
|
|
|
tv.reserve(ggml_nelements(t));
|
|
|
|
|
|
|
|
std::vector<uint8_t> buf(ggml_nbytes(t));
|
|
|
|
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
|
|
|
|
size_t bs = ggml_blck_size(t->type);
|
2023-12-14 20:05:21 +01:00
|
|
|
std::vector<float> vq(ggml_blck_size(t->type));
|
|
|
|
bool quantized = ggml_is_quantized(t->type);
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// access elements by index to avoid gaps in views
|
|
|
|
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
|
|
|
|
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
|
|
|
|
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
|
2023-12-13 13:04:25 +01:00
|
|
|
for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
|
|
|
|
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
|
2023-12-07 21:26:54 +01:00
|
|
|
if (t->type == GGML_TYPE_F16) {
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
|
2024-05-08 08:30:09 +02:00
|
|
|
} else if (t->type == GGML_TYPE_BF16) {
|
|
|
|
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else if (t->type == GGML_TYPE_F32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.push_back(*(float *) &buf[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
} else if (t->type == GGML_TYPE_I32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.push_back((float)*(int32_t *) &buf[i]);
|
2023-12-29 18:07:03 +01:00
|
|
|
} else if (t->type == GGML_TYPE_I16) {
|
|
|
|
tv.push_back((float)*(int16_t *) &buf[i]);
|
|
|
|
} else if (t->type == GGML_TYPE_I8) {
|
|
|
|
tv.push_back((float)*(int8_t *) &buf[i]);
|
2023-12-14 20:05:21 +01:00
|
|
|
} else if (quantized) {
|
2024-04-18 15:18:48 +02:00
|
|
|
tt.to_float(&buf[i], vq.data(), bs);
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.insert(tv.end(), vq.begin(), vq.end());
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return tv;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
static double cosine_similarity(const float * v1, const float * v2, size_t n) {
|
|
|
|
double dot = 0.0;
|
|
|
|
double mag1 = 0.0;
|
|
|
|
double mag2 = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (std::isnan(v1[i]) || std::isnan(v2[i])) {
|
|
|
|
return -1.0f;
|
|
|
|
}
|
|
|
|
if (std::isinf(v1[i]) && std::isinf(v2[i])) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
dot += v1[i]*v2[i];
|
|
|
|
mag1 += v1[i]*v1[i];
|
|
|
|
mag2 += v2[i]*v2[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
return dot/sqrt(mag1*mag2);
|
|
|
|
}
|
|
|
|
|
|
|
|
static float distance(const float * v1, const float * v2, size_t n) {
|
|
|
|
double d = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (std::isnan(v1[i]) || std::isnan(v2[i])) {
|
|
|
|
return INFINITY;
|
|
|
|
}
|
|
|
|
if (std::isinf(v1[i]) && std::isinf(v2[i])) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
d += (v1[i] - v2[i])*(v1[i] - v2[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
return sqrt(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
static float vec_len(const float * v, size_t n) {
|
|
|
|
double d = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (std::isnan(v[i])) {
|
|
|
|
return INFINITY;
|
|
|
|
}
|
|
|
|
if (std::isinf(v[i])) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
d += v[i]*v[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
return sqrt(d);
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
|
|
|
// normalized mean squared error = mse(a, b) / mse(a, 0)
|
|
|
|
static double nmse(const float * a, const float * b, size_t n) {
|
|
|
|
double mse_a_b = 0.0;
|
|
|
|
double mse_a_0 = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
float a_i = a[i];
|
|
|
|
float b_i = b[i];
|
|
|
|
|
|
|
|
mse_a_b += (a_i - b_i) * (a_i - b_i);
|
|
|
|
mse_a_0 += a_i * a_i;
|
|
|
|
}
|
|
|
|
|
|
|
|
return mse_a_b / mse_a_0;
|
|
|
|
}
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
// maximum absolute asymmetry between a and b
|
|
|
|
// asymmetry: (a - b) / (a + b)
|
|
|
|
// This is more stable than relative error if one of the values fluctuates towards zero.
|
|
|
|
// n: number of values to compare.
|
|
|
|
// expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where
|
|
|
|
// a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail.
|
|
|
|
static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector<float> & expected_vals) {
|
|
|
|
double sum = 0.0f;
|
|
|
|
|
|
|
|
size_t nvalid = 0;
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (!expected_vals.empty()) {
|
|
|
|
bool matches_any = false;
|
|
|
|
for (const float & ev : expected_vals) {
|
|
|
|
if (fabsf(a[i] - ev) < 1e-3f) {
|
|
|
|
matches_any = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!matches_any) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const float asymm = (a[i] - b[i]) / (a[i] + b[i]);
|
|
|
|
|
|
|
|
sum += fabsf(asymm);
|
|
|
|
nvalid++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return sum/nvalid;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// utils for printing the variables of the test cases
|
|
|
|
#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
static std::string var_to_str(const T & x) {
|
|
|
|
return std::to_string(x);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T, size_t N>
|
|
|
|
static std::string var_to_str(const T (&x)[N]) {
|
|
|
|
std::string s = "[";
|
|
|
|
for (size_t i = 0; i < N; i++) {
|
|
|
|
if (i > 0) {
|
|
|
|
s += ",";
|
|
|
|
}
|
|
|
|
s += var_to_str(x[i]);
|
|
|
|
}
|
|
|
|
s += "]";
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T, size_t N>
|
|
|
|
static std::string var_to_str(const std::array<T, N> & x) {
|
|
|
|
std::string s = "[";
|
|
|
|
for (size_t i = 0; i < N; i++) {
|
|
|
|
if (i > 0) {
|
|
|
|
s += ",";
|
|
|
|
}
|
|
|
|
s += var_to_str(x[i]);
|
|
|
|
}
|
|
|
|
s += "]";
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
//static std::string var_to_str(ggml_unary_op unary_op) {
|
|
|
|
// return ggml_unary_op_name(unary_op);
|
|
|
|
//}
|
|
|
|
|
|
|
|
static std::string var_to_str(ggml_type type) {
|
|
|
|
return ggml_type_name(type);
|
|
|
|
}
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
static std::string var_to_str(ggml_op_pool pool) {
|
|
|
|
switch (pool) {
|
|
|
|
case GGML_OP_POOL_AVG: return "avg";
|
|
|
|
case GGML_OP_POOL_MAX: return "max";
|
|
|
|
default: return std::to_string(pool);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
#define VARS_TO_STR1(a) VAR_TO_STR(a)
|
|
|
|
#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
|
|
|
|
#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
|
|
|
|
#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
|
|
|
|
#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
|
|
|
|
#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
|
|
|
|
#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
|
|
|
|
#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
|
|
|
|
#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
|
|
|
|
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
|
|
|
|
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
|
2024-01-31 14:10:15 +01:00
|
|
|
#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug functio, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add know issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export funciton print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug.
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math.
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 16:56:23 +01:00
|
|
|
#ifdef GGML_USE_SYCL
|
|
|
|
static bool inline _isinf(float f) {
|
|
|
|
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static bool inline _isinf(float f) { return std::isinf(f); }
|
|
|
|
#endif
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
// accept FLT_MAX as infinity
|
|
|
|
static bool isinf_or_max(float f) {
|
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug functio, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add know issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export funciton print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug.
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math.
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 16:56:23 +01:00
|
|
|
return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool ggml_is_view_op(enum ggml_op op) {
|
|
|
|
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
|
|
|
}
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
enum test_mode {
|
|
|
|
MODE_TEST,
|
|
|
|
MODE_PERF,
|
2024-09-03 17:21:46 +02:00
|
|
|
MODE_GRAD,
|
2023-12-13 20:54:54 +01:00
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
struct test_case {
|
|
|
|
virtual ~test_case() {}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
virtual std::string op_desc(ggml_tensor * t) {
|
|
|
|
return ggml_op_desc(t);
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
virtual std::string vars() {
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
|
|
|
|
|
|
|
|
virtual double max_nmse_err() {
|
2023-12-13 13:04:25 +01:00
|
|
|
return 1e-7;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
virtual double max_maa_err() {
|
|
|
|
return 1e-4;
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual float grad_eps(){
|
|
|
|
return 1e-1f;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If false, estimate gradient with 2 points, neglects 3rd order derivative and higher.
|
|
|
|
// If true, estimate gradient with 4 points, neglects 5th order derivative and higher.
|
|
|
|
virtual bool grad_precise(){
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests).
|
|
|
|
virtual int64_t grad_nmax() {
|
|
|
|
return 10000;
|
|
|
|
}
|
|
|
|
|
|
|
|
// No effect if empty.
|
|
|
|
// If not empty, skip all gradient checks where the numerical result does not match any of the values.
|
|
|
|
// Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable.
|
|
|
|
virtual std::vector<float> grad_expect() {
|
|
|
|
return {};
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
virtual void initialize_tensors(ggml_context * ctx) {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual size_t op_size(ggml_tensor * t) {
|
|
|
|
size_t size = ggml_nbytes(t);
|
|
|
|
// add source tensors
|
|
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
|
|
if (t->src[i] != NULL) {
|
|
|
|
size += ggml_nbytes(t->src[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_cgraph * gf = nullptr;
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_cgraph * gb = nullptr;
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
static const int sentinel_size = 1024;
|
|
|
|
|
|
|
|
test_mode mode;
|
|
|
|
|
|
|
|
std::vector<ggml_tensor *> sentinels;
|
|
|
|
|
|
|
|
void add_sentinel(ggml_context * ctx) {
|
2024-09-03 17:21:46 +02:00
|
|
|
if (mode == MODE_PERF || mode == MODE_GRAD) {
|
2023-12-13 20:54:54 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
|
|
|
|
ggml_format_name(sentinel, "sent_%zu", sentinels.size());
|
|
|
|
sentinels.push_back(sentinel);
|
|
|
|
}
|
|
|
|
|
|
|
|
// hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
|
2023-12-13 20:54:54 +01:00
|
|
|
mode = MODE_TEST;
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_init_params params = {
|
|
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
|
|
|
/* .mem_base = */ NULL,
|
|
|
|
/* .no_alloc = */ true,
|
|
|
|
};
|
|
|
|
ggml_context * ctx = ggml_init(params);
|
2024-09-03 17:21:46 +02:00
|
|
|
GGML_ASSERT(ctx);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
gf = ggml_new_graph(ctx);
|
|
|
|
|
|
|
|
// pre-graph sentinel
|
|
|
|
add_sentinel(ctx);
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
|
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
fflush(stdout);
|
|
|
|
|
2024-01-29 21:50:50 +01:00
|
|
|
// check if the backends support the ops
|
2023-12-29 09:32:31 +01:00
|
|
|
bool supported = true;
|
2023-12-07 21:26:54 +01:00
|
|
|
for (ggml_backend_t backend : {backend1, backend2}) {
|
2024-01-29 21:50:50 +01:00
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (!ggml_backend_supports_op(backend, t)) {
|
|
|
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
|
|
|
supported = false;
|
|
|
|
break;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
2023-12-29 09:32:31 +01:00
|
|
|
if (!supported) {
|
|
|
|
printf("\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// post-graph sentinel
|
|
|
|
add_sentinel(ctx);
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// allocate
|
|
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
|
2024-01-12 20:07:38 +01:00
|
|
|
if (buf == NULL) {
|
|
|
|
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
|
|
|
|
ggml_free(ctx);
|
|
|
|
return false;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
// build graph
|
|
|
|
ggml_build_forward_expand(gf, out);
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// add sentinels as graph nodes so that they are checked in the callback
|
|
|
|
for (ggml_tensor * sentinel : sentinels) {
|
2024-09-12 13:23:49 +02:00
|
|
|
ggml_graph_add_node(gf, sentinel);
|
2023-12-13 20:54:54 +01:00
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// randomize tensors
|
|
|
|
initialize_tensors(ctx);
|
|
|
|
|
|
|
|
// compare
|
|
|
|
struct callback_userdata {
|
|
|
|
bool ok;
|
|
|
|
double max_err;
|
2024-01-04 09:43:23 +01:00
|
|
|
ggml_backend_t backend1;
|
|
|
|
ggml_backend_t backend2;
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
callback_userdata ud {
|
|
|
|
true,
|
|
|
|
max_nmse_err(),
|
2024-01-04 09:43:23 +01:00
|
|
|
backend1,
|
|
|
|
backend2
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
|
2023-12-13 20:54:54 +01:00
|
|
|
callback_userdata * ud = (callback_userdata *) user_data;
|
2024-01-04 09:43:23 +01:00
|
|
|
const char * bn1 = ggml_backend_name(ud->backend1);
|
|
|
|
const char * bn2 = ggml_backend_name(ud->backend2);
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
if (t1->op == GGML_OP_NONE) {
|
|
|
|
// sentinels must be unchanged
|
|
|
|
std::vector<uint8_t> t1_data(ggml_nbytes(t1));
|
|
|
|
std::vector<uint8_t> t2_data(ggml_nbytes(t2));
|
|
|
|
ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
|
|
|
|
ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
|
|
|
|
|
|
|
|
if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
|
|
|
|
printf("sentinel mismatch: %s ", t1->name);
|
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
std::vector<float> f1 = tensor_to_float(t1);
|
|
|
|
std::vector<float> f2 = tensor_to_float(t2);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < f1.size(); i++) {
|
|
|
|
// check for nans
|
|
|
|
if (std::isnan(f1[i]) || std::isnan(f2[i])) {
|
2024-01-04 09:43:23 +01:00
|
|
|
printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// check for infs: both must be inf of the same sign, or both must be finite
|
|
|
|
if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
|
|
|
|
if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
|
|
|
|
if (std::signbit(f1[i]) != std::signbit(f2[i])) {
|
2024-01-04 09:43:23 +01:00
|
|
|
printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else {
|
2024-01-04 09:43:23 +01:00
|
|
|
printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
double err = nmse(f1.data(), f2.data(), f1.size());
|
|
|
|
if (err > ud->max_err) {
|
2024-01-09 08:58:55 +01:00
|
|
|
printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
|
2024-01-02 09:57:44 +01:00
|
|
|
//for (int i = 0; i < (int) f1.size(); i++) {
|
2023-12-13 20:54:54 +01:00
|
|
|
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
2023-12-13 13:04:25 +01:00
|
|
|
//}
|
|
|
|
//printf("\n");
|
2023-12-13 20:54:54 +01:00
|
|
|
//exit(1);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
}
|
|
|
|
return true;
|
2023-12-13 13:04:25 +01:00
|
|
|
|
|
|
|
GGML_UNUSED(index);
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
2024-01-12 20:07:38 +01:00
|
|
|
const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-01-12 20:07:38 +01:00
|
|
|
if (!cmp_ok) {
|
|
|
|
printf("compare failed ");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
|
|
|
|
ggml_free(ctx);
|
|
|
|
|
2024-01-12 20:07:38 +01:00
|
|
|
if (ud.ok && cmp_ok) {
|
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
|
|
return false;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
bool eval_perf(ggml_backend_t backend, const char * op_name) {
|
2023-12-13 20:54:54 +01:00
|
|
|
mode = MODE_PERF;
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
static const size_t graph_nodes = 8192;
|
|
|
|
|
|
|
|
ggml_init_params params = {
|
|
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
|
|
|
|
/* .mem_base = */ NULL,
|
|
|
|
/* .no_alloc = */ true,
|
|
|
|
};
|
|
|
|
ggml_context * ctx = ggml_init(params);
|
2024-09-03 17:21:46 +02:00
|
|
|
GGML_ASSERT(ctx);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
|
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
fflush(stdout);
|
|
|
|
|
|
|
|
// check if backends support op
|
|
|
|
if (!ggml_backend_supports_op(backend, out)) {
|
|
|
|
printf("not supported\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// align while also leaving some margin for variations in parameters
|
|
|
|
int align = 20;
|
|
|
|
int last = (len + align - 1) / align * align;
|
|
|
|
if (last - len < 5) {
|
|
|
|
last += align;
|
|
|
|
}
|
|
|
|
last = std::max(last, 60);
|
|
|
|
printf("%*s", last - len, "");
|
|
|
|
|
|
|
|
// allocate
|
|
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
2024-01-12 20:07:38 +01:00
|
|
|
if (buf == NULL) {
|
|
|
|
printf("failed to allocate tensors\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return false;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
// randomize tensors
|
|
|
|
initialize_tensors(ctx);
|
|
|
|
|
|
|
|
// build graph
|
|
|
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
|
|
|
|
ggml_build_forward_expand(gf, out);
|
|
|
|
|
|
|
|
// warmup run
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
|
|
|
|
// duplicate the op
|
|
|
|
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
|
2024-09-12 13:23:49 +02:00
|
|
|
int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
|
2023-12-07 21:26:54 +01:00
|
|
|
for (int i = 1; i < n_runs; i++) {
|
2024-09-12 13:23:49 +02:00
|
|
|
ggml_graph_add_node(gf, out);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// calculate memory
|
|
|
|
size_t mem = n_runs * op_size(out);
|
|
|
|
auto tensor_op_size = [](ggml_tensor * t) {
|
|
|
|
size_t size = ggml_nbytes(t);
|
|
|
|
// add source tensors
|
|
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
|
|
if (t->src[i] != NULL) {
|
|
|
|
size += ggml_nbytes(t->src[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return size;
|
|
|
|
};
|
2024-09-12 13:23:49 +02:00
|
|
|
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
|
|
|
if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
|
2023-12-07 21:26:54 +01:00
|
|
|
continue;
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
2024-09-12 13:23:49 +02:00
|
|
|
mem += tensor_op_size(ggml_graph_node(gf, i));
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// run
|
|
|
|
ggml_backend_synchronize(backend);
|
|
|
|
|
|
|
|
int64_t start_time = ggml_time_us();
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
ggml_backend_synchronize(backend);
|
|
|
|
int64_t end_time = ggml_time_us();
|
|
|
|
double time_us = end_time - start_time;
|
|
|
|
|
|
|
|
printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
|
|
|
|
n_runs,
|
|
|
|
time_us / n_runs,
|
|
|
|
op_size(out) / 1024,
|
|
|
|
mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
|
|
|
|
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
|
|
|
|
ggml_free(ctx);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
bool eval_grad(ggml_backend_t backend, const char * op_name) {
|
|
|
|
mode = MODE_GRAD;
|
|
|
|
const std::vector<float> expect = grad_expect();
|
|
|
|
|
|
|
|
ggml_init_params params = {
|
|
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true),
|
|
|
|
/* .mem_base = */ NULL,
|
|
|
|
/* .no_alloc = */ true,
|
|
|
|
};
|
|
|
|
ggml_context * ctx = ggml_init(params);
|
|
|
|
GGML_ASSERT(ctx);
|
|
|
|
|
|
|
|
gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
|
|
|
|
gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
|
|
|
|
|
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
|
|
|
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
|
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
|
|
|
fflush(stdout);
|
|
|
|
|
|
|
|
if (out->grad == nullptr) {
|
|
|
|
printf("backwards pass not supported \n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (out->type != GGML_TYPE_F32) {
|
|
|
|
ggml_free(ctx);
|
|
|
|
printf("not supported [%s->type != FP32]\n", out->name);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// check if the backend supports the ops
|
|
|
|
bool supported = true;
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (!ggml_backend_supports_op(backend, t)) {
|
|
|
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
|
|
|
supported = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
|
|
|
|
printf("not supported [%s->type != FP32] ", t->name);
|
|
|
|
supported = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!supported) {
|
|
|
|
printf("\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
int64_t ngrads = 0;
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->flags & GGML_TENSOR_FLAG_PARAM) {
|
|
|
|
ngrads += ggml_nelements(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ngrads > grad_nmax()) {
|
|
|
|
printf("skipping large tensors for speed \n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!ggml_is_scalar(out)) {
|
|
|
|
out = ggml_sum(ctx, out);
|
|
|
|
ggml_set_name(out, "sum_of_out");
|
|
|
|
}
|
2024-09-20 18:04:44 +02:00
|
|
|
ggml_set_loss(out);
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
ggml_build_forward_expand(gf, out);
|
|
|
|
ggml_graph_cpy(gf, gb);
|
2024-09-20 18:04:44 +02:00
|
|
|
ggml_build_backward_expand(ctx, gf, gb, false, false);
|
2024-09-03 17:21:46 +02:00
|
|
|
if (expect.size() != 1 || expect[0] != 0.0f) {
|
2024-09-12 13:23:49 +02:00
|
|
|
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
|
2024-09-03 17:21:46 +02:00
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: refactor so that this check is only needed once
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (!ggml_backend_supports_op(backend, t)) {
|
|
|
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
|
|
|
supported = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
|
|
|
|
printf("not supported [%s->type != FP32] ", t->name);
|
|
|
|
supported = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!supported) {
|
|
|
|
printf("\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// allocate
|
|
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
|
|
|
if (buf == NULL) {
|
|
|
|
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend));
|
|
|
|
ggml_free(ctx);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2024-09-20 18:04:44 +02:00
|
|
|
initialize_tensors(ctx); // Randomizes all tensors (including gradients).
|
|
|
|
ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise.
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
ggml_backend_graph_compute(backend, gb);
|
|
|
|
|
|
|
|
bool ok = true;
|
|
|
|
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char * bn = ggml_backend_name(backend);
|
|
|
|
const int64_t ne = ggml_nelements(t);
|
|
|
|
|
|
|
|
std::vector<float> ga = tensor_to_float(t->grad);
|
|
|
|
|
|
|
|
for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
|
|
|
|
// check for nans
|
|
|
|
if (!std::isfinite(ga[i])) {
|
2024-09-08 08:57:57 +02:00
|
|
|
printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ok = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!ok) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<float> gn(ne); // gradient numeric
|
|
|
|
GGML_ASSERT(ga.size() == gn.size());
|
|
|
|
|
|
|
|
std::vector<float> x0 = tensor_to_float(t); // original t data
|
|
|
|
GGML_ASSERT(ggml_is_scalar(out));
|
|
|
|
GGML_ASSERT(out->type == GGML_TYPE_F32);
|
|
|
|
|
|
|
|
const float eps = grad_eps();
|
|
|
|
for (int64_t i = 0; i < ne; ++i) {
|
|
|
|
const float xiu = x0[i] + 1.0f*eps; // x, index i, up
|
|
|
|
const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half
|
|
|
|
const float xidh = x0[i] - 0.5f*eps; // x, index i, down half
|
|
|
|
const float xid = x0[i] - 1.0f*eps; // x, index i, down
|
|
|
|
|
|
|
|
float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh
|
|
|
|
|
|
|
|
ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float));
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out));
|
|
|
|
|
|
|
|
ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float));
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out));
|
|
|
|
|
|
|
|
if (grad_precise()) {
|
|
|
|
ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float));
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out));
|
|
|
|
|
|
|
|
ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float));
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out));
|
|
|
|
|
|
|
|
gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps);
|
|
|
|
} else {
|
|
|
|
gn[i] = (fu - fd) / (2.0f*eps);
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t));
|
|
|
|
}
|
|
|
|
|
|
|
|
const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect);
|
|
|
|
if (err > max_maa_err()) {
|
|
|
|
printf("[%s] MAA = %.9f > %.9f ", ggml_op_desc(t), err, max_maa_err());
|
|
|
|
ok = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!ok) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!ok) {
|
|
|
|
printf("compare failed ");
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
|
|
|
|
ggml_free(ctx);
|
|
|
|
|
|
|
|
if (ok) {
|
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// ###################################
|
|
|
|
// ## Section 2: GGML Op Defintions ##
|
|
|
|
// ###################################
|
|
|
|
|
|
|
|
|
|
|
|
// The following is an example showing the bare minimum for creating a test for a GGML op.
|
|
|
|
|
|
|
|
// GGML_OP_EXAMPLE
|
|
|
|
struct test_example : public test_case {
|
|
|
|
// Always define these 2 or variants thereof:
|
|
|
|
const ggml_type type; // The type of the input tensors.
|
|
|
|
const std::array<int64_t, 4> ne; // The shape of the input tensors.
|
|
|
|
// For some ops it's necessary to define multiple types or shapes for the inputs.
|
|
|
|
// Or they may need additional parameters.
|
|
|
|
|
|
|
|
// Put all parameters needed to fully define the test into one of the VARS_TO_STR macros.
|
|
|
|
// In most cases these are just the properties of the struct that you defined above.
|
|
|
|
// This is needed for info prints.
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Define a constructor for the struct.
|
|
|
|
// In most cases it will be sufficient to have the same arguments as the struct has properties
|
|
|
|
// and just use initializer lists.
|
|
|
|
test_example(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
// Define how a simple GGML compute graph can be constructed for the new GGML op.
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
// Step 1: create input tensors that don't depend on any other tensors:
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging.
|
|
|
|
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
|
|
|
// Step 2: use the op that you want to test in the GGML compute graph.
|
|
|
|
ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition.
|
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
|
|
|
// Step 3: return the output tensor.
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
// In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
|
|
|
|
// immediately after you create the tensors.
|
|
|
|
// This is optional and only makes sense if a backwards pass has actually been implemented for the new op.
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_UNARY
|
|
|
|
struct test_unary : public test_case {
|
|
|
|
const ggml_unary_op op;
|
|
|
|
const ggml_type type;
|
2024-06-12 15:00:22 +02:00
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
int v; // view (1 : non-contiguous a)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-06-12 15:00:22 +02:00
|
|
|
return VARS_TO_STR3(type, ne_a, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_unary(ggml_unary_op op,
|
|
|
|
ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {128, 2, 2, 2},
|
2024-06-12 15:00:22 +02:00
|
|
|
int v = 0)
|
|
|
|
: op(op), type(type), ne_a(ne_a), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-09-03 17:21:46 +02:00
|
|
|
const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG ||
|
|
|
|
op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU;
|
|
|
|
|
2024-06-12 15:00:22 +02:00
|
|
|
ggml_tensor * a;
|
|
|
|
if (v & 1) {
|
|
|
|
auto ne = ne_a; ne[0] *= 3;
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
if (grad_supported) {
|
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
}
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-06-12 15:00:22 +02:00
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "view_of_a");
|
2024-06-12 15:00:22 +02:00
|
|
|
} else {
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
if (grad_supported) {
|
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
}
|
|
|
|
ggml_set_name(a, "a");
|
2024-06-12 15:00:22 +02:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2024-06-12 15:00:22 +02:00
|
|
|
ggml_tensor * out = ggml_unary(ctx, a, op);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
// test extended range of values to check for NaNs in GELU
|
|
|
|
init_tensor_uniform(t, -150.f, 150.f);
|
|
|
|
}
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 15.0f;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<float> grad_expect() override {
|
|
|
|
if (op == GGML_UNARY_OP_ABS) {
|
|
|
|
return {-1.0f, 1.0f};
|
|
|
|
}
|
|
|
|
if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) {
|
|
|
|
return {0.0f};
|
|
|
|
}
|
|
|
|
if (op == GGML_UNARY_OP_RELU) {
|
|
|
|
return {0.0f, 1.0f};
|
|
|
|
}
|
|
|
|
return {};
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_GET_ROWS
|
|
|
|
struct test_get_rows : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const int n; // cols
|
|
|
|
const int m; // rows
|
|
|
|
const int r; // rows to get
|
2023-12-13 13:04:25 +01:00
|
|
|
const int b; // batch size
|
|
|
|
const bool v; // view (non-contiguous src1)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2023-12-13 13:04:25 +01:00
|
|
|
return VARS_TO_STR6(type, n, m, r, b, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
|
|
|
|
: type(type), n(n), m(m), r(r), b(b), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(in, "in");
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(rows, "rows");
|
2023-12-13 13:04:25 +01:00
|
|
|
if (v) {
|
|
|
|
rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(rows, "view_of_rows");
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
|
|
|
|
if (grad_supported) {
|
|
|
|
ggml_set_param(ctx, in);
|
|
|
|
// rows is a constant input -> no gradients
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_get_rows(ctx, in, rows);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
if (ggml_is_view_op(t->op)) { continue; }
|
2023-12-07 21:26:54 +01:00
|
|
|
// rows
|
2023-12-13 13:04:25 +01:00
|
|
|
std::vector<int> data(r*b);
|
|
|
|
for (int i = 0; i < r*b; i++) {
|
2023-12-07 21:26:54 +01:00
|
|
|
data[i] = rand() % m;
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_REPEAT
|
|
|
|
struct test_repeat : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const std::array<int, 4> nr;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
return ggml_nbytes(t) * 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_repeat(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
2023-12-07 21:26:54 +01:00
|
|
|
std::array<int, 4> nr = {2, 2, 2, 2})
|
|
|
|
: type(type), ne(ne), nr(nr) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(target, "target");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, src);
|
|
|
|
ggml_set_name(src, "src");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_repeat(ctx, src, target);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_DUP
|
|
|
|
struct test_dup : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2023-12-29 18:07:03 +01:00
|
|
|
const std::array<int64_t, 4> permute;
|
|
|
|
bool _use_permute;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2023-12-29 18:07:03 +01:00
|
|
|
std::string v = VARS_TO_STR2(type, ne);
|
|
|
|
if (_use_permute) v += "," + VAR_TO_STR(permute);
|
|
|
|
return v;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_dup(ggml_type type = GGML_TYPE_F32,
|
2024-07-17 13:23:50 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 20, 1},
|
2023-12-29 18:07:03 +01:00
|
|
|
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
|
|
|
: type(type), ne(ne), permute(permute),
|
|
|
|
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, src);
|
|
|
|
ggml_set_name(src, "src");
|
|
|
|
|
2023-12-29 18:07:03 +01:00
|
|
|
if (_use_permute) {
|
|
|
|
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(src, "src_permuted");
|
2023-12-29 18:07:03 +01:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_dup(ctx, src);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SET
|
|
|
|
struct test_set : public test_case {
|
|
|
|
const ggml_type type_src;
|
|
|
|
const ggml_type type_dst;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const int dim;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type_src, type_dst, ne, dim);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {6, 5, 4, 3}, int dim = 1)
|
|
|
|
: type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
|
|
|
ggml_set_param(ctx, src);
|
|
|
|
ggml_set_name(src, "src");
|
|
|
|
|
|
|
|
auto ne_dst = ne;
|
|
|
|
for (int i = 0; i < dim; ++i) {
|
|
|
|
ne_dst[i] *= 2;
|
|
|
|
}
|
|
|
|
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
|
|
|
|
ggml_set_param(ctx, dst);
|
|
|
|
ggml_set_name(dst, "dst");
|
|
|
|
|
|
|
|
size_t offset = 0;
|
|
|
|
for (int i = 0; i < dim; ++i) {
|
|
|
|
offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
|
|
|
|
}
|
|
|
|
ggml_tensor * out = ggml_set(ctx, dst, src,
|
|
|
|
// The backwards pass requires setting a contiguous region:
|
|
|
|
src->nb[1], src->nb[2], src->nb[3], offset);
|
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_CPY
|
|
|
|
struct test_cpy : public test_case {
|
|
|
|
const ggml_type type_src;
|
|
|
|
const ggml_type type_dst;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2024-07-17 13:23:50 +02:00
|
|
|
const std::array<int64_t, 4> permute;
|
|
|
|
bool _src_use_permute;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-07-17 13:23:50 +02:00
|
|
|
return VARS_TO_STR4(type_src, type_dst, ne, permute);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-06-23 13:14:45 +02:00
|
|
|
double max_nmse_err() override {
|
|
|
|
return 1e-6;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
2024-07-17 13:23:50 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
2024-08-01 15:26:22 +02:00
|
|
|
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
2024-07-17 13:23:50 +02:00
|
|
|
: type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
|
|
|
|
_src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, src);
|
|
|
|
ggml_set_name(src, "src");
|
|
|
|
|
2024-07-17 13:23:50 +02:00
|
|
|
if (_src_use_permute) {
|
|
|
|
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(src, "src_permuted");
|
2024-07-17 13:23:50 +02:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2024-07-17 13:23:50 +02:00
|
|
|
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(dst, "dst");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_cpy(ctx, src, dst);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_CONT
|
|
|
|
struct test_cont : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cont(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 1})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, src);
|
|
|
|
ggml_set_name(src, "src");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
src = ggml_transpose(ctx, src);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(src, "src_transposed");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_cont(ctx, src);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_ADD
|
|
|
|
// GGML_OP_MUL
|
|
|
|
// GGML_OP_DIV
|
|
|
|
struct test_bin_bcast : public test_case {
|
|
|
|
using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
|
|
|
|
op_t op;
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const std::array<int, 4> nr;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
return ggml_nbytes(t) * 3;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 1, 1},
|
|
|
|
std::array<int, 4> nr = {1, 2, 1, 1})
|
|
|
|
: op(op), type(type), ne(ne), nr(nr) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
|
|
|
// The backwards pass supports broadcasting only for GGML_ADD:
|
|
|
|
const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
|
|
|
|
if (grad_supported) {
|
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_param(ctx, b);
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = op(ctx, a, b);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
2024-09-03 17:21:46 +02:00
|
|
|
if (op == ggml_mul || op == ggml_div) {
|
|
|
|
// MUL and DIV have numerical issues around zero:
|
|
|
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return op == ggml_div;
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_maa_err() override {
|
|
|
|
return op == ggml_add ? 1e-4 : 1e-3;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_ADD1
|
|
|
|
struct test_add1 : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_add1(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
|
|
|
ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
|
|
|
|
// ggml_set_param(ctx, b); // TODO: implement
|
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
|
|
|
ggml_tensor * out = ggml_add1(ctx, a, b);
|
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SCALE
|
|
|
|
struct test_scale : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2023-12-21 22:20:49 +01:00
|
|
|
float scale;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2023-12-21 22:20:49 +01:00
|
|
|
return VARS_TO_STR3(type, ne, scale);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_scale(ggml_type type = GGML_TYPE_F32,
|
2023-12-21 22:20:49 +01:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
|
|
|
float scale = 2.0f)
|
|
|
|
: type(type), ne(ne), scale(scale) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_scale(ctx, a, scale);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_NORM
|
|
|
|
struct test_norm : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
float eps;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, eps);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_norm(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {64, 5, 4, 3},
|
2023-12-07 21:26:54 +01:00
|
|
|
float eps = 1e-6f)
|
|
|
|
: type(type), ne(ne), eps(eps) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_norm(ctx, a, eps);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_RMS_NORM
|
|
|
|
struct test_rms_norm : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
float eps;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, eps);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_rms_norm(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {64, 5, 4, 3},
|
2023-12-07 21:26:54 +01:00
|
|
|
float eps = 1e-6f)
|
|
|
|
: type(type), ne(ne), eps(eps) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
2024-08-26 16:55:36 +02:00
|
|
|
// GGML_OP_SSM_CONV
|
|
|
|
struct test_ssm_conv : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const std::array<int64_t, 4> ne_b;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne_a, ne_b);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_ssm_conv(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
|
|
|
|
std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
|
|
|
|
: type(type), ne_a(ne_a), ne_b(ne_b) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
|
|
ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SSM_SCAN
|
|
|
|
struct test_ssm_scan : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
|
|
|
|
const int64_t d_state;
|
|
|
|
const int64_t d_inner;
|
|
|
|
const int64_t n_seq_tokens;
|
|
|
|
const int64_t n_seqs;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_ssm_scan(ggml_type type = GGML_TYPE_F32,
|
|
|
|
int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
|
|
|
|
: type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * x = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * dt = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, 1 , 1 }.data());
|
|
|
|
ggml_tensor * B = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * C = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-09-22 04:29:12 +02:00
|
|
|
// GGML_OP_RWKV_WKV
|
|
|
|
struct test_rwkv_wkv : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
|
|
|
|
const int64_t head_count;
|
|
|
|
const int64_t head_size;
|
|
|
|
const int64_t n_seq_tokens;
|
|
|
|
const int64_t n_seqs;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_rwkv_wkv(ggml_type type = GGML_TYPE_F32,
|
|
|
|
int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
|
|
|
|
: type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
const int64_t n_tokens = n_seq_tokens * n_seqs;
|
|
|
|
ggml_tensor * r = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
|
|
|
ggml_tensor * k = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ head_size, 1, head_count, n_tokens }.data());
|
|
|
|
ggml_tensor * v = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
|
|
|
ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
|
|
|
|
ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
|
|
|
ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
|
|
|
|
ggml_tensor * out = ggml_rwkv_wkv(ctx, k, v, r, tf, td, s);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_MUL_MAT
|
|
|
|
struct test_mul_mat : public test_case {
|
|
|
|
const ggml_type type_a;
|
|
|
|
const ggml_type type_b;
|
|
|
|
const int64_t m;
|
|
|
|
const int64_t n;
|
|
|
|
const int64_t k;
|
|
|
|
const std::array<int64_t, 2> bs; // dims 3 and 4
|
|
|
|
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 5e-4;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
|
|
|
|
size_t b = ggml_nbytes(t->src[1]) * m;
|
|
|
|
size_t c = ggml_nbytes(t);
|
|
|
|
return a + b + c;
|
|
|
|
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
|
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
|
|
|
std::array<int64_t, 2> bs = {10, 10},
|
|
|
|
std::array<int64_t, 2> nr = {2, 2})
|
|
|
|
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
|
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
|
|
|
|
ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_param(ctx, b);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_mul_mat(ctx, a, b);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_MUL_MAT_ID
|
|
|
|
struct test_mul_mat_id : public test_case {
|
|
|
|
const ggml_type type_a;
|
|
|
|
const ggml_type type_b;
|
|
|
|
const int n_mats;
|
2024-04-18 15:18:48 +02:00
|
|
|
const int n_used;
|
|
|
|
const bool b; // brodcast b matrix
|
2023-12-07 21:26:54 +01:00
|
|
|
const int64_t m;
|
|
|
|
const int64_t n;
|
|
|
|
const int64_t k;
|
|
|
|
|
|
|
|
std::string vars() override {
|
2024-04-18 15:18:48 +02:00
|
|
|
return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 5e-4;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
2023-12-13 13:04:25 +01:00
|
|
|
size_t a = ggml_nbytes(t->src[2]) * n;
|
2023-12-07 21:26:54 +01:00
|
|
|
size_t b = ggml_nbytes(t->src[1]) * m;
|
|
|
|
size_t c = ggml_nbytes(t);
|
|
|
|
return a + b + c;
|
|
|
|
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
2024-04-18 15:18:48 +02:00
|
|
|
int n_mats = 8, int n_used = 2, bool b = false,
|
|
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32)
|
|
|
|
: type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
|
|
|
|
m(m), n(n), k(k) {
|
|
|
|
GGML_ASSERT(n_used <= n_mats);
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
2024-04-18 15:18:48 +02:00
|
|
|
ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(as, "as");
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(ids, "ids");
|
2024-04-18 15:18:48 +02:00
|
|
|
if (n_used != n_mats) {
|
|
|
|
ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(ids, "view_of_ids");
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
2023-12-13 13:04:25 +01:00
|
|
|
std::random_device rd;
|
|
|
|
std::default_random_engine rng(rd());
|
2023-12-07 21:26:54 +01:00
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
if (ggml_is_view_op(t->op)) { continue; }
|
2023-12-07 21:26:54 +01:00
|
|
|
// ids
|
2023-12-13 13:04:25 +01:00
|
|
|
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
|
|
|
std::vector<int32_t> data(t->ne[0]);
|
|
|
|
for (int i = 0; i < t->ne[0]; i++) {
|
|
|
|
data[i] = i % n_mats;
|
|
|
|
}
|
|
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
|
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-09-20 18:04:44 +02:00
|
|
|
// GGML_OP_OUT_PROD
|
|
|
|
struct test_out_prod : public test_case {
|
|
|
|
const ggml_type type_a;
|
|
|
|
const ggml_type type_b;
|
|
|
|
const int64_t m;
|
|
|
|
const int64_t n;
|
|
|
|
const int64_t k;
|
|
|
|
const std::array<int64_t, 2> bs; // dims 3 and 4
|
|
|
|
const bool trans_b;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b);
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 5e-4;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
|
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
|
|
|
std::array<int64_t, 2> bs = {10, 10},
|
|
|
|
bool trans_b = false)
|
|
|
|
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
|
|
|
ggml_tensor * b;
|
|
|
|
if (trans_b) {
|
|
|
|
b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]);
|
|
|
|
b = ggml_transpose(ctx, b);
|
|
|
|
} else {
|
|
|
|
b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]);
|
|
|
|
}
|
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
|
|
|
ggml_tensor * out = ggml_out_prod(ctx, a, b);
|
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_SQR
|
|
|
|
struct test_sqr : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sqr(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
2023-12-07 21:26:54 +01:00
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_sqr(ctx, a);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum.
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
2024-06-17 00:23:04 +02:00
|
|
|
// GGML_OP_SQRT
|
|
|
|
struct test_sqrt : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sqrt(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 3, 3, 2})
|
2024-06-17 00:23:04 +02:00
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-06-17 00:23:04 +02:00
|
|
|
ggml_tensor * out = ggml_sqrt(ctx, a);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-06-17 00:23:04 +02:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
// fill with positive values
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
2024-09-03 17:21:46 +02:00
|
|
|
init_tensor_uniform(t, 50.0f, 100.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 20.0f;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_LOG
|
|
|
|
struct test_log : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_log(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
|
|
|
ggml_tensor * out = ggml_log(ctx, a);
|
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
// log(1) == 0, cluster values there to keep the sum low for better precision in the backwards pass:
|
|
|
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
2024-06-17 00:23:04 +02:00
|
|
|
}
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
2024-06-17 00:23:04 +02:00
|
|
|
};
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
// GGML_OP_SIN
|
|
|
|
struct test_sin : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sin(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 2, 2, 2})
|
2024-08-27 21:01:45 +02:00
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
ggml_tensor * out = ggml_sin(ctx, a);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
2024-09-03 17:21:46 +02:00
|
|
|
init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
|
2024-08-27 21:01:45 +02:00
|
|
|
}
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
double max_maa_err() override {
|
|
|
|
return 1e-3;
|
|
|
|
}
|
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 0.2f;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
2024-08-27 21:01:45 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_COS
|
|
|
|
struct test_cos : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cos(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 2, 2, 2})
|
2024-08-27 21:01:45 +02:00
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
ggml_tensor * out = ggml_cos(ctx, a);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
2024-09-03 17:21:46 +02:00
|
|
|
init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
|
2024-08-27 21:01:45 +02:00
|
|
|
}
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
double max_maa_err() override {
|
|
|
|
return 1e-3;
|
|
|
|
}
|
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 0.2f;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
2024-08-27 21:01:45 +02:00
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_CLAMP
|
|
|
|
struct test_clamp : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
float min;
|
|
|
|
float max;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, ne, min, max);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_clamp(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
2023-12-07 21:26:54 +01:00
|
|
|
float min = -0.5f, float max = 0.5f)
|
|
|
|
: type(type), ne(ne), min(min), max(max) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_clamp(ctx, a, min, max);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 1e-2f;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<float> grad_expect() override {
|
|
|
|
return {0.0f, 1.0f};
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_DIAG_MASK_INF
|
|
|
|
struct test_diag_mask_inf : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const int n_past;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, n_past);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 3, 2},
|
2023-12-07 21:26:54 +01:00
|
|
|
int n_past = 5)
|
|
|
|
: type(type), ne(ne), n_past(n_past) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SOFT_MAX
|
|
|
|
struct test_soft_max : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2024-01-29 21:50:50 +01:00
|
|
|
const bool mask;
|
2024-02-17 22:04:16 +01:00
|
|
|
const float scale;
|
|
|
|
const float max_bias;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-02-17 22:04:16 +01:00
|
|
|
return VARS_TO_STR5(type, ne, mask, scale, max_bias);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
// the 1024 test with bias occasionally fails:
|
|
|
|
// SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
|
|
|
|
virtual double max_nmse_err() override {
|
|
|
|
return 1e-6;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
test_soft_max(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
2024-02-17 22:04:16 +01:00
|
|
|
bool mask = false,
|
2024-01-29 21:50:50 +01:00
|
|
|
float scale = 1.0f,
|
2024-02-17 22:04:16 +01:00
|
|
|
float max_bias = 0.0f)
|
|
|
|
: type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-02-17 22:04:16 +01:00
|
|
|
ggml_tensor * mask = nullptr;
|
|
|
|
if (this->mask) {
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(mask, "mask");
|
2024-02-17 22:04:16 +01:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2024-05-11 09:32:41 +02:00
|
|
|
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
2024-07-17 13:23:50 +02:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_ROPE
|
|
|
|
struct test_rope : public test_case {
|
|
|
|
const ggml_type type;
|
2024-05-29 19:17:31 +02:00
|
|
|
const std::array<int64_t, 4> ne_a;
|
2023-12-07 21:26:54 +01:00
|
|
|
int n_dims;
|
|
|
|
int mode;
|
2024-06-05 10:29:20 +02:00
|
|
|
int n_ctx; // used to generate positions
|
2024-05-29 19:17:31 +02:00
|
|
|
float fs; // freq_scale
|
|
|
|
float ef; // ext_factor
|
|
|
|
float af; // attn_factor
|
2024-05-22 10:01:35 +02:00
|
|
|
bool ff;
|
2024-05-29 19:17:31 +02:00
|
|
|
int v; // view (1 : non-contiguous a)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-05-29 19:17:31 +02:00
|
|
|
return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_rope(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {10, 5, 3, 1},
|
2024-05-29 19:17:31 +02:00
|
|
|
int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
|
|
|
|
: type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-05-29 19:17:31 +02:00
|
|
|
ggml_tensor * a;
|
|
|
|
if (v & 1) {
|
|
|
|
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "view_of_a");
|
2024-05-29 19:17:31 +02:00
|
|
|
} else {
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
2024-05-29 19:17:31 +02:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(pos, "pos");
|
|
|
|
|
|
|
|
ggml_tensor * freq = nullptr;
|
|
|
|
if (ff) {
|
|
|
|
freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2);
|
|
|
|
ggml_set_name(freq, "freq");
|
|
|
|
}
|
|
|
|
|
2024-06-05 10:29:20 +02:00
|
|
|
ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
|
|
|
// pos
|
2024-05-29 19:17:31 +02:00
|
|
|
std::vector<int> data(ne_a[2]);
|
|
|
|
for (int i = 0; i < ne_a[2]; i++) {
|
2023-12-07 21:26:54 +01:00
|
|
|
data[i] = rand() % n_ctx;
|
|
|
|
}
|
2024-05-29 19:17:31 +02:00
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
2024-05-22 10:01:35 +02:00
|
|
|
if (t->ne[0] == n_dims/2) {
|
|
|
|
// frequency factors in the range [0.9f, 1.1f]
|
|
|
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
double max_maa_err() override {
|
|
|
|
return 1e-3;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
// GGML_OP_POOL2D
|
|
|
|
struct test_pool2d : public test_case {
|
|
|
|
enum ggml_op_pool pool_type;
|
|
|
|
const ggml_type type_input;
|
|
|
|
const std::array<int64_t, 4> ne_input;
|
|
|
|
// kernel size
|
|
|
|
const int k0;
|
|
|
|
const int k1;
|
|
|
|
// stride
|
|
|
|
const int s0;
|
|
|
|
const int s1;
|
|
|
|
// padding
|
|
|
|
const int p0;
|
|
|
|
const int p1;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
|
|
|
|
ggml_type type_input = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
|
|
|
int k0 = 3, int k1 = 3,
|
|
|
|
int s0 = 1, int s1 = 1,
|
|
|
|
int p0 = 1, int p1 = 1)
|
|
|
|
: pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, input);
|
|
|
|
ggml_set_name(input, "input");
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-07-02 18:09:52 +02:00
|
|
|
// GGML_OP_CONV_TRANSPOSE_1D
|
|
|
|
struct test_conv_transpose_1d : public test_case {
|
|
|
|
const std::array<int64_t, 4> ne_input;
|
|
|
|
const std::array<int64_t, 4> ne_kernel;
|
|
|
|
|
2024-07-08 09:39:36 +02:00
|
|
|
const int s0; // stride
|
|
|
|
const int p0; // padding
|
|
|
|
const int d0; // dilation
|
2024-07-02 18:09:52 +02:00
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
|
|
|
|
std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
|
|
|
int s0 = 1, int p0 = 0, int d0 = 1)
|
|
|
|
: ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(input, "input");
|
|
|
|
|
2024-07-02 18:09:52 +02:00
|
|
|
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(kernel, "kernel");
|
|
|
|
|
2024-07-02 18:09:52 +02:00
|
|
|
ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-07-02 18:09:52 +02:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_IM2COL
|
|
|
|
struct test_im2col : public test_case {
|
|
|
|
const ggml_type type_input;
|
|
|
|
const ggml_type type_kernel;
|
2024-01-31 14:10:15 +01:00
|
|
|
const ggml_type dst_type;
|
2023-12-07 21:26:54 +01:00
|
|
|
const std::array<int64_t, 4> ne_input;
|
|
|
|
const std::array<int64_t, 4> ne_kernel;
|
|
|
|
// stride
|
|
|
|
const int s0;
|
|
|
|
const int s1;
|
|
|
|
// padding
|
|
|
|
const int p0;
|
|
|
|
const int p1;
|
2024-07-02 18:09:52 +02:00
|
|
|
// dilation
|
2023-12-07 21:26:54 +01:00
|
|
|
const int d0;
|
|
|
|
const int d1;
|
|
|
|
// mode
|
|
|
|
const bool is_2D;
|
|
|
|
|
|
|
|
std::string vars() override {
|
2024-01-31 14:10:15 +01:00
|
|
|
return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
|
2023-12-07 21:26:54 +01:00
|
|
|
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
|
|
|
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
|
|
|
int s0 = 1, int s1 = 1,
|
|
|
|
int p0 = 1, int p1 = 1,
|
|
|
|
int d0 = 1, int d1 = 1,
|
|
|
|
bool is_2D = true)
|
2024-01-31 14:10:15 +01:00
|
|
|
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, input);
|
|
|
|
ggml_set_name(input, "input");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(kernel, "kernel");
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_CONCAT
|
|
|
|
struct test_concat : public test_case {
|
|
|
|
const ggml_type type;
|
2024-05-28 10:04:19 +02:00
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const int64_t ne_b_d;
|
|
|
|
const int dim;
|
2024-05-29 14:38:26 +02:00
|
|
|
const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-05-29 14:38:26 +02:00
|
|
|
return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_concat(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {10, 5, 5, 5},
|
|
|
|
int64_t ne_b_d = 5,
|
2024-05-29 14:38:26 +02:00
|
|
|
int dim = 2, int v = 0)
|
|
|
|
: type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-05-28 10:04:19 +02:00
|
|
|
auto ne_b = ne_a;
|
|
|
|
ne_b[dim] = ne_b_d;
|
2024-05-29 14:38:26 +02:00
|
|
|
ggml_tensor * a;
|
|
|
|
if (v & 1) {
|
|
|
|
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-05-29 14:38:26 +02:00
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "view_of_a");
|
2024-05-29 14:38:26 +02:00
|
|
|
} else {
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
2024-05-29 14:38:26 +02:00
|
|
|
}
|
|
|
|
ggml_tensor * b;
|
|
|
|
if (v & 2) {
|
|
|
|
auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
|
|
|
|
b = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
2024-05-29 14:38:26 +02:00
|
|
|
b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(b, "view_of_b");
|
2024-05-29 14:38:26 +02:00
|
|
|
} else {
|
|
|
|
b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(b, "b");
|
2024-05-29 14:38:26 +02:00
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
2024-05-28 10:04:19 +02:00
|
|
|
ggml_tensor * out = ggml_concat(ctx, a, b, dim);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_ARGSORT
|
|
|
|
struct test_argsort : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
ggml_sort_order order;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, order);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_argsort(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {16, 10, 10, 10},
|
2024-02-25 11:09:09 +01:00
|
|
|
ggml_sort_order order = GGML_SORT_ORDER_ASC)
|
2023-12-07 21:26:54 +01:00
|
|
|
: type(type), ne(ne), order(order) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_argsort(ctx, a, order);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
std::random_device rd;
|
|
|
|
std::default_random_engine rng(rd());
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
|
|
|
// indices
|
|
|
|
std::vector<int> data(ggml_nelements(t));
|
|
|
|
for (int i = 0; i < ggml_nelements(t); i++) {
|
|
|
|
data[i] = rand();
|
|
|
|
}
|
|
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
|
|
|
|
} else if (t->type == GGML_TYPE_F32) {
|
|
|
|
// initialize with unique values to avoid ties
|
|
|
|
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
|
|
|
std::vector<float> data(t->ne[0]);
|
|
|
|
for (int i = 0; i < t->ne[0]; i++) {
|
|
|
|
data[i] = i;
|
|
|
|
}
|
|
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
|
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
|
|
|
|
}
|
|
|
|
} else {
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
// GGML_OP_SUM
|
|
|
|
struct test_sum : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sum(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
|
|
|
ggml_tensor * out = ggml_sum(ctx, a);
|
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_SUM_ROWS
|
|
|
|
struct test_sum_rows : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sum_rows(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
2023-12-07 21:26:54 +01:00
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_sum_rows(ctx, a);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// GGML_OP_UPSCALE
|
|
|
|
struct test_upscale : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const int32_t scale_factor;
|
2024-05-15 10:52:33 +02:00
|
|
|
const bool transpose;
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-05-15 10:52:33 +02:00
|
|
|
return VARS_TO_STR4(type, ne, scale_factor, transpose);
|
2023-12-13 20:54:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_upscale(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
2024-05-15 10:52:33 +02:00
|
|
|
int32_t scale_factor = 2, bool transpose = false)
|
|
|
|
: type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
|
|
|
if (transpose) {
|
|
|
|
a = ggml_transpose(ctx, a);
|
|
|
|
ggml_set_name(a, "a_transposed");
|
|
|
|
}
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-05-15 10:52:33 +02:00
|
|
|
// GGML_OP_UPSCALE (ext)
|
|
|
|
struct test_upscale_ext : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const std::array<int64_t, 4> ne_tgt;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, ne_tgt);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_upscale_ext(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {2, 5, 7, 11},
|
|
|
|
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
|
|
|
|
: type(type), ne(ne), ne_tgt(ne_tgt) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-05-15 10:52:33 +02:00
|
|
|
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-05-15 10:52:33 +02:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// GGML_OP_GROUP_NORM
|
|
|
|
struct test_group_norm : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const int32_t num_groups;
|
2024-08-06 09:26:46 +02:00
|
|
|
const float eps;
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, num_groups);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_group_norm(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {64, 64, 320, 1},
|
2024-08-06 09:26:46 +02:00
|
|
|
int32_t num_groups = 32,
|
|
|
|
float eps = 1e-6f)
|
|
|
|
: type(type), ne(ne), num_groups(num_groups), eps(eps) {}
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-08-06 09:26:46 +02:00
|
|
|
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_ACC
|
|
|
|
struct test_acc : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const std::array<int64_t, 4> ne_b;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne_a, ne_b);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_acc(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {256, 17, 1, 1},
|
|
|
|
std::array<int64_t, 4> ne_b = {256, 16, 1, 1})
|
2023-12-13 20:54:54 +01:00
|
|
|
: type(type), ne_a(ne_a), ne_b(ne_b) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, a);
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, b);
|
|
|
|
ggml_set_name(b, "b");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_PAD
|
|
|
|
struct test_pad : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const int pad_0;
|
|
|
|
const int pad_1;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_pad(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
|
|
|
|
int pad_0 = 1, int pad_1 = 1)
|
|
|
|
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-03-03 13:23:52 +01:00
|
|
|
// GGML_OP_ARANGE
|
|
|
|
struct test_arange : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const float start;
|
|
|
|
const float stop;
|
|
|
|
const float step;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, start, stop, step);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_arange(ggml_type type = GGML_TYPE_F32,
|
|
|
|
float start = 0.f, float stop = 10.f, float step = 1.f)
|
|
|
|
: type(type), start(start), stop(stop), step(step) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * out = ggml_arange(ctx, start, stop, step);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-03-03 13:23:52 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_TIMESTEP_EMBEDDING
|
|
|
|
struct test_timestep_embedding : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const int dim;
|
|
|
|
const int max_period;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, ne_a, dim, max_period);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
|
|
|
|
int dim = 320, int max_period=10000)
|
|
|
|
: type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2024-03-03 13:23:52 +01:00
|
|
|
ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-03-03 13:23:52 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// GGML_OP_LEAKY_RELU
|
|
|
|
struct test_leaky_relu : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const float negative_slope;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne_a, negative_slope);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_leaky_relu(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {10, 5, 4, 3},
|
2023-12-13 20:54:54 +01:00
|
|
|
float negative_slope = 0.1f)
|
|
|
|
: type(type), ne_a(ne_a), negative_slope(negative_slope) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
// GGML_OP_FLASH_ATTN_EXT
|
|
|
|
struct test_flash_attn_ext : public test_case {
|
|
|
|
const int64_t hs; // head size
|
|
|
|
const int64_t nh; // num heads
|
|
|
|
const int64_t kv; // kv size
|
|
|
|
const int64_t nb; // batch size
|
|
|
|
|
2024-05-14 18:09:30 +02:00
|
|
|
const bool mask; // use mask
|
|
|
|
|
2024-05-11 09:32:41 +02:00
|
|
|
const float max_bias; // ALiBi
|
2024-08-24 21:34:59 +02:00
|
|
|
const float logit_softcap; // Gemma 2
|
2024-05-11 09:32:41 +02:00
|
|
|
|
2024-06-01 08:44:14 +02:00
|
|
|
const ggml_type type_KV;
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
std::string vars() override {
|
2024-08-24 21:34:59 +02:00
|
|
|
return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 5e-4;
|
|
|
|
}
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
|
|
|
|
bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
2024-08-24 21:34:59 +02:00
|
|
|
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-06-01 23:26:10 +02:00
|
|
|
const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
|
|
|
|
|
|
|
|
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(q, "q");
|
|
|
|
|
2024-06-01 23:26:10 +02:00
|
|
|
ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(k, "k");
|
|
|
|
|
2024-06-01 23:26:10 +02:00
|
|
|
ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(v, "v");
|
|
|
|
|
|
|
|
ggml_tensor * m = nullptr;
|
|
|
|
if (mask) {
|
|
|
|
m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
|
|
|
|
ggml_set_name(m, "m");
|
|
|
|
}
|
|
|
|
|
2024-08-24 21:34:59 +02:00
|
|
|
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
return out;
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
};
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
// GGML_OP_CROSS_ENTROPY_LOSS
|
|
|
|
struct test_cross_entropy_loss : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
|
2024-09-03 17:21:46 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
2024-08-27 21:01:45 +02:00
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_param(ctx, logits);
|
|
|
|
ggml_set_name(logits, "logits");
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-09-03 17:21:46 +02:00
|
|
|
// The labels are assumed to be constant -> no gradients.
|
|
|
|
ggml_set_name(labels, "labels");
|
|
|
|
|
|
|
|
// Ensure labels add up to 1:
|
|
|
|
labels = ggml_soft_max(ctx, labels);
|
|
|
|
ggml_set_name(labels, "labels_normalized");
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
|
2024-09-03 17:21:46 +02:00
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
return out;
|
|
|
|
}
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
// For larger abs. diffs between logits softmax is more linear, therefore more precise num. gradients.
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
init_tensor_uniform(t, -100.0f, 100.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
float grad_eps() override {
|
|
|
|
return 1.0f;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
2024-08-27 21:01:45 +02:00
|
|
|
};
|
|
|
|
|
2024-09-20 18:04:44 +02:00
|
|
|
// GGML_OP_OPT_STEP_ADAMW
|
|
|
|
struct test_opt_step_adamw : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const float alpha;
|
|
|
|
const float beta1;
|
|
|
|
const float beta2;
|
|
|
|
const float eps;
|
|
|
|
const float wd;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 5, 4, 3},
|
|
|
|
float alpha = 1e-3f,
|
|
|
|
float beta1 = 0.9f,
|
|
|
|
float beta2 = 0.999f,
|
|
|
|
float eps = 1e-8f,
|
|
|
|
float wd = 0.0f)
|
|
|
|
: type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
|
|
|
ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
|
|
|
|
ggml_set_name(a, "a");
|
|
|
|
|
|
|
|
ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd);
|
|
|
|
ggml_set_name(out, "out");
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool grad_precise() override {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-01-29 21:50:50 +01:00
|
|
|
enum llm_norm_type {
|
|
|
|
LLM_NORM,
|
|
|
|
LLM_NORM_RMS,
|
|
|
|
};
|
|
|
|
|
|
|
|
struct llama_hparams {
|
|
|
|
uint32_t n_vocab;
|
|
|
|
uint32_t n_embd;
|
|
|
|
uint32_t n_head;
|
|
|
|
uint32_t n_head_kv;
|
|
|
|
static constexpr uint32_t n_layer = 1;
|
|
|
|
uint32_t n_rot;
|
|
|
|
uint32_t n_embd_head; // dimension of values (d_v)
|
|
|
|
uint32_t n_ff;
|
|
|
|
|
|
|
|
float f_norm_eps;
|
|
|
|
float f_norm_rms_eps;
|
|
|
|
|
|
|
|
// cparams
|
|
|
|
static constexpr uint32_t n_ctx = 512; // user-specified context size
|
2024-06-05 10:29:20 +02:00
|
|
|
static constexpr uint32_t n_ctx_orig = n_ctx;
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
// batch
|
|
|
|
int32_t n_tokens;
|
|
|
|
|
|
|
|
// llm_build_context
|
|
|
|
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
|
|
|
|
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
|
|
|
|
|
|
|
|
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
|
|
|
|
return n_embd_head * n_head_kv;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// LLM base class
|
|
|
|
struct test_llm : public test_case {
|
|
|
|
llama_hparams hp;
|
|
|
|
|
|
|
|
protected:
|
|
|
|
test_llm(llama_hparams hp)
|
|
|
|
: hp(std::move(hp)) {
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
struct ggml_tensor * llm_build_norm(
|
|
|
|
struct ggml_context * ctx,
|
|
|
|
struct ggml_tensor * cur,
|
|
|
|
struct ggml_tensor * mw,
|
|
|
|
struct ggml_tensor * mb,
|
|
|
|
llm_norm_type type) {
|
|
|
|
switch (type) {
|
|
|
|
case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
|
|
|
|
case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
|
|
|
|
}
|
|
|
|
cur = ggml_mul(ctx, cur, mw);
|
|
|
|
if (mb) {
|
|
|
|
cur = ggml_add(ctx, cur, mb);
|
|
|
|
}
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
void llm_build_kv_store(
|
|
|
|
struct ggml_context * ctx,
|
|
|
|
struct ggml_tensor * k_l,
|
|
|
|
struct ggml_tensor * v_l,
|
|
|
|
struct ggml_tensor * k_cur,
|
|
|
|
struct ggml_tensor * v_cur) {
|
|
|
|
// compute the transposed [n_tokens, n_embd] V matrix
|
|
|
|
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
|
|
|
|
|
|
|
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
|
|
|
|
(ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
|
|
|
|
|
|
|
|
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
|
|
|
|
( hp.n_ctx)*ggml_element_size(v_l),
|
|
|
|
(hp.kv_head)*ggml_element_size(v_l));
|
|
|
|
|
|
|
|
// important: storing RoPE-ed version of K in the KV cache!
|
|
|
|
ggml_cpy(ctx, k_cur, k_cache_view);
|
|
|
|
ggml_cpy(ctx, v_cur_t, v_cache_view);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ggml_tensor * llm_build_kqv(
|
|
|
|
struct ggml_context * ctx,
|
|
|
|
struct ggml_tensor * k_l,
|
|
|
|
struct ggml_tensor * v_l,
|
|
|
|
struct ggml_tensor * q_cur,
|
|
|
|
struct ggml_tensor * kq_mask,
|
|
|
|
float kq_scale) {
|
|
|
|
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
|
|
|
|
|
|
|
struct ggml_tensor * k =
|
|
|
|
ggml_view_3d(ctx, k_l,
|
|
|
|
hp.n_embd_head, hp.n_kv, hp.n_head_kv,
|
|
|
|
ggml_row_size(k_l->type, hp.n_embd_gqa()),
|
|
|
|
ggml_row_size(k_l->type, hp.n_embd_head),
|
|
|
|
0);
|
|
|
|
|
|
|
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
|
|
|
|
2024-05-11 09:32:41 +02:00
|
|
|
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
// split cached v into n_head heads
|
|
|
|
struct ggml_tensor * v =
|
|
|
|
ggml_view_3d(ctx, v_l,
|
|
|
|
hp.n_kv, hp.n_embd_head, hp.n_head_kv,
|
|
|
|
ggml_element_size(v_l)*hp.n_ctx,
|
|
|
|
ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
|
|
|
|
0);
|
|
|
|
|
|
|
|
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
|
|
|
|
|
|
|
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
|
|
|
|
|
|
|
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
|
|
|
|
|
|
|
|
struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
|
|
|
cur = ggml_mul_mat(ctx, wo, cur);
|
|
|
|
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
|
|
|
// pos
|
|
|
|
std::vector<int> data(hp.n_tokens);
|
|
|
|
for (int i = 0; i < hp.n_tokens; i++) {
|
|
|
|
data[i] = rand() % hp.n_ctx;
|
|
|
|
}
|
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
|
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Llama
|
|
|
|
struct test_llama : public test_llm {
|
|
|
|
static constexpr float freq_base = 10000.0f;
|
|
|
|
static constexpr float freq_scale = 1.0f;
|
|
|
|
static constexpr float ext_factor = 0.0f;
|
|
|
|
static constexpr float attn_factor = 1.0f;
|
|
|
|
static constexpr float beta_fast = 32.0f;
|
|
|
|
static constexpr float beta_slow = 1.0f;
|
|
|
|
|
|
|
|
std::string op_desc(ggml_tensor * t) override {
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
return "LLAMA";
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
auto n_tokens = hp.n_tokens;
|
|
|
|
return VARS_TO_STR1(n_tokens);
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 2e-3;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_llama(int n_tokens = 1)
|
|
|
|
: test_llm({
|
|
|
|
/*n_vocab =*/ 32000,
|
|
|
|
/*n_embd =*/ 3200,
|
|
|
|
/*n_head =*/ 32,
|
|
|
|
/*n_head_kv =*/ 32,
|
|
|
|
/*n_rot =*/ 100,
|
|
|
|
/*n_embd_head =*/ 100,
|
|
|
|
/*n_ff =*/ 8640,
|
|
|
|
/*f_norm_eps =*/ 0.f,
|
|
|
|
/*f_norm_rms_eps =*/ 1e-5f,
|
|
|
|
/*n_tokens =*/ n_tokens,
|
|
|
|
}) {
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
struct ggml_tensor * cur;
|
|
|
|
struct ggml_tensor * inpL;
|
|
|
|
|
|
|
|
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
|
|
|
|
|
|
|
// inp_pos - contains the positions
|
|
|
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
|
|
|
|
|
|
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
|
|
|
|
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
|
|
|
struct ggml_tensor * inpSA = inpL;
|
|
|
|
|
|
|
|
// norm
|
|
|
|
ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
|
|
|
|
// self-attention
|
|
|
|
{
|
|
|
|
ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
|
|
|
ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
|
|
|
ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
|
|
|
|
|
|
|
// compute Q and K and RoPE them
|
|
|
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
|
|
|
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
|
|
|
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
|
|
|
|
|
2024-05-21 22:28:32 +02:00
|
|
|
Qcur = ggml_rope_ext(
|
|
|
|
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
|
2024-06-05 10:29:20 +02:00
|
|
|
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
2024-01-29 21:50:50 +01:00
|
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
2024-05-21 22:28:32 +02:00
|
|
|
Kcur = ggml_rope_ext(
|
|
|
|
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
|
2024-06-05 10:29:20 +02:00
|
|
|
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
2024-01-29 21:50:50 +01:00
|
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
|
|
|
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
|
|
|
|
|
|
|
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
|
|
|
|
|
|
|
|
// feed-forward network
|
|
|
|
ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
|
|
|
|
ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
|
|
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
|
|
|
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
|
|
struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_gate, cur);
|
|
|
|
cur = ggml_silu(ctx, cur);
|
|
|
|
cur = ggml_mul(ctx, cur, tmp);
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
|
|
|
|
|
|
|
cur = ggml_add(ctx, cur, ffn_inp);
|
|
|
|
|
|
|
|
// input for next layer
|
|
|
|
inpL = cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
cur = inpL;
|
|
|
|
|
|
|
|
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
|
|
|
|
// lm_head
|
|
|
|
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
|
|
|
|
cur = ggml_mul_mat(ctx, output, cur);
|
|
|
|
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Falcon
|
|
|
|
struct test_falcon : public test_llm {
|
|
|
|
static constexpr float freq_base = 10000.0f;
|
|
|
|
static constexpr float freq_scale = 1.0f;
|
|
|
|
static constexpr float ext_factor = 0.0f;
|
|
|
|
static constexpr float attn_factor = 1.0f;
|
|
|
|
static constexpr float beta_fast = 32.0f;
|
|
|
|
static constexpr float beta_slow = 1.0f;
|
|
|
|
|
|
|
|
std::string op_desc(ggml_tensor * t) override {
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
return "FALCON";
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
auto n_tokens = hp.n_tokens;
|
|
|
|
return VARS_TO_STR1(n_tokens);
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 2e-3;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_falcon(int n_tokens = 1)
|
|
|
|
: test_llm({
|
|
|
|
/*n_vocab =*/ 32000,
|
|
|
|
/*n_embd =*/ 3200,
|
|
|
|
/*n_head =*/ 50,
|
|
|
|
/*n_head_kv =*/ 1,
|
|
|
|
/*n_rot =*/ 64,
|
|
|
|
/*n_embd_head =*/ 64,
|
|
|
|
/*n_ff =*/ 8640,
|
|
|
|
/*f_norm_eps =*/ 1e-5f,
|
|
|
|
/*f_norm_rms_eps =*/ 0.f,
|
|
|
|
/*n_tokens =*/ n_tokens,
|
|
|
|
}) {
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
struct ggml_tensor * cur;
|
|
|
|
struct ggml_tensor * inpL;
|
|
|
|
|
|
|
|
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
|
|
|
|
|
|
|
// inp_pos - contains the positions
|
|
|
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
|
|
|
|
|
|
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
|
|
|
|
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
|
|
|
// norm
|
|
|
|
ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
|
|
|
|
|
|
|
|
// self-attention
|
|
|
|
{
|
|
|
|
cur = attn_norm;
|
|
|
|
|
|
|
|
ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
|
|
|
|
|
|
|
|
cur = ggml_mul_mat(ctx, wqkv, cur);
|
|
|
|
|
|
|
|
struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
|
|
|
|
struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
|
|
|
|
struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
|
|
|
|
|
|
|
|
Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
|
|
|
|
Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
|
|
|
|
|
|
|
|
// using mode = 2 for neox mode
|
2024-05-21 22:28:32 +02:00
|
|
|
Qcur = ggml_rope_ext(
|
2024-06-05 10:29:20 +02:00
|
|
|
ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
2024-01-29 21:50:50 +01:00
|
|
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
2024-05-21 22:28:32 +02:00
|
|
|
Kcur = ggml_rope_ext(
|
2024-06-05 10:29:20 +02:00
|
|
|
ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
2024-01-29 21:50:50 +01:00
|
|
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
|
|
|
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
|
|
|
|
|
|
|
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ggml_tensor * ffn_inp = cur;
|
|
|
|
|
|
|
|
// feed forward
|
|
|
|
{
|
|
|
|
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
|
|
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
|
|
|
cur = attn_norm;
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_up, cur);
|
|
|
|
cur = ggml_gelu(ctx, cur);
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
|
|
|
}
|
|
|
|
|
|
|
|
cur = ggml_add(ctx, cur, ffn_inp);
|
|
|
|
|
|
|
|
cur = ggml_add(ctx, cur, inpL);
|
|
|
|
|
|
|
|
// input for next layer
|
|
|
|
inpL = cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
cur = inpL;
|
|
|
|
|
|
|
|
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
|
|
|
|
|
|
|
|
// lm_head
|
|
|
|
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
|
|
|
|
cur = ggml_mul_mat(ctx, output, cur);
|
|
|
|
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
|
|
|
|
// ###########################################
|
|
|
|
// ## Section 3: GGML Op Test Instantiation ##
|
|
|
|
// ###########################################
|
|
|
|
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
|
|
|
std::vector<std::unique_ptr<test_case>> test_cases;
|
2024-01-09 08:58:55 +01:00
|
|
|
std::default_random_engine rng(0);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
const ggml_type all_types[] = {
|
2024-05-08 08:30:09 +02:00
|
|
|
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
2023-12-13 13:04:25 +01:00
|
|
|
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
|
|
|
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
|
|
|
GGML_TYPE_Q8_0,
|
|
|
|
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
|
|
|
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
2024-01-17 17:54:56 +01:00
|
|
|
GGML_TYPE_Q6_K,
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-06 03:48:47 +02:00
|
|
|
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
|
2024-02-26 17:28:38 +01:00
|
|
|
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
2024-03-26 15:21:27 +01:00
|
|
|
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
|
2024-02-27 15:34:24 +01:00
|
|
|
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
2023-12-13 13:04:25 +01:00
|
|
|
};
|
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
const ggml_type base_types[] = {
|
|
|
|
GGML_TYPE_F32, GGML_TYPE_F16,
|
|
|
|
GGML_TYPE_Q4_0,
|
|
|
|
GGML_TYPE_Q4_K,
|
|
|
|
GGML_TYPE_IQ2_XXS
|
|
|
|
};
|
|
|
|
|
|
|
|
const ggml_type other_types[] = {
|
|
|
|
GGML_TYPE_Q4_1,
|
|
|
|
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
|
|
|
GGML_TYPE_Q8_0,
|
|
|
|
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
|
|
|
GGML_TYPE_Q5_K,
|
|
|
|
GGML_TYPE_Q6_K,
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-06 03:48:47 +02:00
|
|
|
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
|
2024-04-18 15:18:48 +02:00
|
|
|
GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
|
|
|
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
|
|
|
|
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
2024-07-02 08:39:38 +02:00
|
|
|
GGML_TYPE_BF16,
|
2024-04-18 15:18:48 +02:00
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// unary ops
|
2024-06-12 15:00:22 +02:00
|
|
|
for (int v : {0, 1}) {
|
|
|
|
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 2, 2, 2 }, v));
|
|
|
|
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 5, 7, 11, 13 }, v));
|
2024-06-12 15:00:22 +02:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
|
|
|
|
for (ggml_type type : all_types) {
|
|
|
|
for (int b : {1, 7}) {
|
|
|
|
for (bool v : {false, true}) {
|
|
|
|
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
|
|
|
|
}
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
2023-12-29 18:07:03 +01:00
|
|
|
for (int b : {1, 7}) {
|
|
|
|
for (bool v : {false, true}) {
|
|
|
|
test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
|
|
|
|
}
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
for (ggml_type type_input : {GGML_TYPE_F32}) {
|
|
|
|
for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
|
|
|
|
for (int k0 : {1, 3}) {
|
|
|
|
for (int k1 : {1, 3}) {
|
|
|
|
for (int s0 : {1, 2}) {
|
|
|
|
for (int s1 : {1, 2}) {
|
|
|
|
for (int p0 : {0, 1}) {
|
|
|
|
for (int p1 : {0, 1}) {
|
|
|
|
test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
|
2024-01-31 14:10:15 +01:00
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
2024-08-02 10:50:53 +02:00
|
|
|
// test cases for 1D im2col
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
2024-08-02 10:50:53 +02:00
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
2024-01-31 14:10:15 +01:00
|
|
|
|
2024-08-20 17:06:51 +02:00
|
|
|
// sycl backend will limit task global_range < MAX_INT
|
|
|
|
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
|
|
|
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
|
|
|
// these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
|
|
|
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
|
|
|
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
|
|
|
|
2024-07-02 18:09:52 +02:00
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d());
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
|
|
|
|
2024-09-20 18:04:44 +02:00
|
|
|
for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2023-12-29 18:07:03 +01:00
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
|
2023-12-29 18:07:03 +01:00
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
|
|
|
|
test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
|
|
|
|
}
|
|
|
|
|
2024-01-29 13:37:33 +01:00
|
|
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
|
|
for (ggml_type type_dst : all_types) {
|
|
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
|
|
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
|
2024-01-29 13:37:33 +01:00
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_cont());
|
2024-08-28 10:23:02 +02:00
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
|
|
|
|
for (auto op : {ggml_add, ggml_mul, ggml_div}) {
|
|
|
|
test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
|
2023-12-13 13:04:25 +01:00
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
|
2023-12-07 21:26:54 +01:00
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
|
2024-09-03 17:21:46 +02:00
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 1, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 2, 2, 2});
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
// stable diffusion
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
|
2023-12-13 13:04:25 +01:00
|
|
|
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
|
|
|
|
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_add1());
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_scale());
|
|
|
|
|
|
|
|
for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
|
|
|
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-08-26 16:55:36 +02:00
|
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
|
|
|
|
|
2024-09-22 04:29:12 +02:00
|
|
|
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 1, 1));
|
|
|
|
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 1));
|
|
|
|
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 4));
|
|
|
|
test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 128, 4));
|
|
|
|
|
2024-07-19 17:17:27 +02:00
|
|
|
#if 1
|
2024-04-18 15:18:48 +02:00
|
|
|
for (ggml_type type_a : base_types) {
|
2023-12-29 09:32:31 +01:00
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
|
|
|
|
}
|
|
|
|
}
|
2024-07-19 17:17:27 +02:00
|
|
|
#else
|
|
|
|
// m = a rows
|
|
|
|
// n = b rows
|
|
|
|
// k = cols
|
|
|
|
std::uniform_int_distribution<> dist_m(1, 128);
|
|
|
|
std::uniform_int_distribution<> dist_n(16, 128);
|
|
|
|
std::uniform_int_distribution<> dist_k(1, 16);
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
for (ggml_type type_a : all_types) {
|
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
|
|
int m = dist_m(rng);
|
|
|
|
int n = dist_n(rng);
|
|
|
|
int k = dist_k(rng) * ggml_blck_size(type_a);
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
for (ggml_type type_a : other_types) {
|
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
2024-08-05 07:52:55 +02:00
|
|
|
if (ggml_blck_size(type_a) != 256) {
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
|
|
|
|
}
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
|
2024-04-18 15:18:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-03-22 08:36:03 +01:00
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
|
|
|
|
|
2024-08-20 17:06:51 +02:00
|
|
|
// sycl backend will limit task global_range < MAX_INT
|
|
|
|
// test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
|
|
|
|
// however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
|
|
|
// this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
|
|
|
// test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
|
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
for (ggml_type type_a : base_types) {
|
2023-12-07 21:26:54 +01:00
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
2024-04-18 15:18:48 +02:00
|
|
|
for (int n_mats : {4, 8}) {
|
|
|
|
for (int n_used : {1, 2, 4}) {
|
|
|
|
for (bool b : {false, true}) {
|
|
|
|
for (int n : {1, 32}) {
|
|
|
|
int m = 512;
|
|
|
|
int k = 256;
|
|
|
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (ggml_type type_a : other_types) {
|
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
|
|
|
for (int n_mats : {4}) {
|
|
|
|
for (int n_used : {2}) {
|
|
|
|
for (bool b : {false}) {
|
|
|
|
for (int n : {1}) {
|
|
|
|
int m = 512;
|
|
|
|
int k = 256;
|
|
|
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-20 18:04:44 +02:00
|
|
|
for (ggml_type type_a : base_types) {
|
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}, true));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
|
|
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_sqr());
|
2024-06-17 00:23:04 +02:00
|
|
|
test_cases.emplace_back(new test_sqrt());
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_log());
|
2024-08-27 21:01:45 +02:00
|
|
|
test_cases.emplace_back(new test_sin());
|
|
|
|
test_cases.emplace_back(new test_cos());
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_clamp());
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
|
|
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5));
|
|
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5));
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-02-17 22:04:16 +01:00
|
|
|
#if 0
|
2024-01-09 08:58:55 +01:00
|
|
|
std::uniform_int_distribution<> dist_ne1(1, 50);
|
|
|
|
int exponent = 1;
|
|
|
|
while (exponent < (1 << 17)) {
|
|
|
|
std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
|
|
|
|
|
|
|
|
for (int n = 0; n < 10; ++n) {
|
|
|
|
int64_t ne0 = dist_ne0(rng);
|
|
|
|
int64_t ne1 = dist_ne1(rng);
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
|
2024-01-09 08:58:55 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
exponent <<= 1;
|
|
|
|
}
|
2024-02-17 22:04:16 +01:00
|
|
|
#endif
|
|
|
|
for (bool mask : {false, true}) {
|
|
|
|
for (float max_bias : {0.0f, 8.0f}) {
|
2024-05-11 09:32:41 +02:00
|
|
|
if (!mask && max_bias > 0.0f) continue;
|
2024-02-17 22:04:16 +01:00
|
|
|
for (float scale : {1.0f, 0.1f}) {
|
|
|
|
for (int64_t ne0 : {16, 1024}) {
|
|
|
|
for (int64_t ne1 : {16, 1024}) {
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
|
2024-02-17 22:04:16 +01:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
|
2024-02-17 22:04:16 +01:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
|
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
|
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
|
2024-01-29 21:50:50 +01:00
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
{
|
|
|
|
bool all = true;
|
|
|
|
|
|
|
|
for (float v : { 0, 1 }) {
|
|
|
|
for (float fs : { 1.0f, 1.4245f }) {
|
|
|
|
for (float ef : { 0.0f, 0.7465f }) {
|
|
|
|
for (float af : { 1.0f, 1.4245f }) {
|
|
|
|
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
|
|
for (bool ff : {false, true}) { // freq_factors
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
|
2024-06-05 10:29:20 +02:00
|
|
|
|
|
|
|
if (all) {
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
|
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
|
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
|
2024-06-05 10:29:20 +02:00
|
|
|
}
|
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
if (all) {
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
|
2024-05-29 19:17:31 +02:00
|
|
|
}
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
2024-05-29 19:17:31 +02:00
|
|
|
}
|
|
|
|
}
|
2024-06-05 10:29:20 +02:00
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
all = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-05-22 10:01:35 +02:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-05-29 14:38:26 +02:00
|
|
|
for (int v : { 0, 1, 2, 3 }) {
|
|
|
|
for (int dim : { 0, 1, 2, 3, }) {
|
|
|
|
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
|
|
|
|
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
|
|
|
|
}
|
2024-05-28 10:04:19 +02:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-02-25 11:09:09 +01:00
|
|
|
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
2023-12-13 13:04:25 +01:00
|
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
2024-04-03 15:07:05 +02:00
|
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
test_cases.emplace_back(new test_sum());
|
2023-12-13 20:54:54 +01:00
|
|
|
test_cases.emplace_back(new test_sum_rows());
|
|
|
|
test_cases.emplace_back(new test_upscale());
|
2024-05-15 10:52:33 +02:00
|
|
|
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
|
|
|
|
test_cases.emplace_back(new test_upscale_ext());
|
2023-12-13 20:54:54 +01:00
|
|
|
test_cases.emplace_back(new test_group_norm());
|
|
|
|
test_cases.emplace_back(new test_acc());
|
|
|
|
test_cases.emplace_back(new test_pad());
|
2024-03-03 13:23:52 +01:00
|
|
|
test_cases.emplace_back(new test_arange());
|
|
|
|
test_cases.emplace_back(new test_timestep_embedding());
|
2023-12-13 20:54:54 +01:00
|
|
|
test_cases.emplace_back(new test_leaky_relu());
|
2023-12-13 13:04:25 +01:00
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
for (int hs : { 64, 80, 128, 256, }) {
|
2024-05-14 18:09:30 +02:00
|
|
|
for (bool mask : { true, false } ) {
|
|
|
|
for (float max_bias : { 0.0f, 8.0f }) {
|
|
|
|
if (!mask && max_bias > 0.0f) continue;
|
2024-08-24 21:34:59 +02:00
|
|
|
for (float logit_softcap : {0.0f, 10.0f}) {
|
|
|
|
if (hs != 128 && logit_softcap != 0.0f) continue;
|
|
|
|
for (int nh : { 32, }) {
|
|
|
|
for (int kv : { 512, 1024, }) {
|
|
|
|
for (int nb : { 1, 2, 4, 8, }) {
|
|
|
|
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
|
|
|
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
|
|
|
|
}
|
2024-06-01 08:44:14 +02:00
|
|
|
}
|
2024-05-14 18:09:30 +02:00
|
|
|
}
|
2024-05-11 09:32:41 +02:00
|
|
|
}
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
test_cases.emplace_back(new test_cross_entropy_loss());
|
2024-09-20 18:04:44 +02:00
|
|
|
for (float wd : {0.0f, 1e-2f}) {
|
|
|
|
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
|
|
|
|
}
|
2024-08-27 21:01:45 +02:00
|
|
|
|
2024-02-13 10:20:24 +01:00
|
|
|
// these tests are disabled to save execution time, but they can be handy for debugging
|
|
|
|
#if 0
|
2024-01-29 21:50:50 +01:00
|
|
|
test_cases.emplace_back(new test_llama(1));
|
|
|
|
test_cases.emplace_back(new test_llama(2));
|
|
|
|
test_cases.emplace_back(new test_falcon(1));
|
|
|
|
test_cases.emplace_back(new test_falcon(2));
|
|
|
|
#endif
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// run tests
|
2024-09-03 17:21:46 +02:00
|
|
|
if (mode == MODE_GRAD) {
|
|
|
|
size_t n_ok = 0;
|
|
|
|
for (auto & test : test_cases) {
|
|
|
|
if (test->eval_grad(backend, op_name)) {
|
|
|
|
n_ok++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
|
|
|
|
|
|
|
|
return n_ok == test_cases.size();
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
if (mode == MODE_TEST) {
|
|
|
|
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
|
|
|
|
|
|
|
size_t n_ok = 0;
|
|
|
|
for (auto & test : test_cases) {
|
|
|
|
if (test->eval(backend, backend_cpu, op_name)) {
|
|
|
|
n_ok++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
|
|
|
|
|
|
|
|
ggml_backend_free(backend_cpu);
|
|
|
|
|
|
|
|
return n_ok == test_cases.size();
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (mode == MODE_PERF) {
|
2023-12-07 21:26:54 +01:00
|
|
|
for (auto & test : test_cases) {
|
|
|
|
test->eval_perf(backend, op_name);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void usage(char ** argv) {
|
|
|
|
printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
|
2024-09-03 17:21:46 +02:00
|
|
|
printf(" valid modes:\n");
|
|
|
|
printf(" - test (default, compare with CPU backend for correctness)\n");
|
|
|
|
printf(" - perf (performance evaluation)\n");
|
|
|
|
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
|
|
|
|
printf(" op names are as given by ggml_op_desc() (e.g. GGML_ADD)\n");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
int main(int argc, char ** argv) {
|
|
|
|
test_mode mode = MODE_TEST;
|
2024-03-13 14:58:30 +01:00
|
|
|
const char * op_name_filter = NULL;
|
|
|
|
const char * backend_filter = NULL;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
|
|
if (strcmp(argv[i], "test") == 0) {
|
|
|
|
mode = MODE_TEST;
|
|
|
|
} else if (strcmp(argv[i], "perf") == 0) {
|
|
|
|
mode = MODE_PERF;
|
2024-09-03 17:21:46 +02:00
|
|
|
} else if (strcmp(argv[i], "grad") == 0) {
|
|
|
|
mode = MODE_GRAD;
|
2023-12-07 21:26:54 +01:00
|
|
|
} else if (strcmp(argv[i], "-o") == 0) {
|
|
|
|
if (i + 1 < argc) {
|
2024-03-13 14:58:30 +01:00
|
|
|
op_name_filter = argv[++i];
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
|
|
|
usage(argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
} else if (strcmp(argv[i], "-b") == 0) {
|
|
|
|
if (i + 1 < argc) {
|
2024-03-13 14:58:30 +01:00
|
|
|
backend_filter = argv[++i];
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
|
|
|
usage(argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
usage(argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// enumerate backends
|
|
|
|
printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
|
|
|
|
|
|
|
|
size_t n_ok = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
|
|
|
|
printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
|
|
|
|
|
2024-03-13 14:58:30 +01:00
|
|
|
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
|
2023-12-07 21:26:54 +01:00
|
|
|
printf(" Skipping\n");
|
|
|
|
n_ok++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
|
|
|
|
GGML_ASSERT(backend != NULL);
|
2024-03-13 14:58:30 +01:00
|
|
|
|
2024-09-03 17:21:46 +02:00
|
|
|
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
|
2024-03-13 14:58:30 +01:00
|
|
|
printf(" Skipping CPU backend\n");
|
|
|
|
ggml_backend_free(backend);
|
|
|
|
n_ok++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
printf(" Backend name: %s\n", ggml_backend_name(backend));
|
|
|
|
|
2024-03-13 14:58:30 +01:00
|
|
|
bool ok = test_backend(backend, mode, op_name_filter);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
printf(" Backend %s: ", ggml_backend_name(backend));
|
|
|
|
if (ok) {
|
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
|
|
n_ok++;
|
|
|
|
} else {
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\n");
|
|
|
|
|
|
|
|
ggml_backend_free(backend);
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
if (n_ok != ggml_backend_reg_get_count()) {
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
|
|
return 1;
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2024-01-17 17:54:56 +01:00
|
|
|
ggml_quantize_free();
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
|
|
return 0;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|