2023-12-07 21:26:54 +01:00
|
|
|
#include <ggml.h>
|
|
|
|
#include <ggml-alloc.h>
|
|
|
|
#include <ggml-backend.h>
|
2024-05-12 19:40:45 +02:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
#include <algorithm>
|
|
|
|
#include <array>
|
|
|
|
#include <cfloat>
|
|
|
|
#include <cstring>
|
|
|
|
#include <functional>
|
|
|
|
#include <memory>
|
|
|
|
#include <random>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string>
|
|
|
|
#include <thread>
|
|
|
|
#include <vector>
|
|
|
|
|
2024-05-18 02:39:54 +02:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
2024-01-17 17:54:56 +01:00
|
|
|
// static RNG initialization (revisit if n_threads stops being constant)
|
|
|
|
static const size_t n_threads = std::thread::hardware_concurrency();
|
|
|
|
static std::vector<std::default_random_engine> generators = []() {
|
|
|
|
std::random_device rd;
|
|
|
|
std::vector<std::default_random_engine> vec;
|
|
|
|
vec.reserve(n_threads);
|
|
|
|
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
|
|
|
|
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
|
|
|
|
return vec;
|
|
|
|
}();
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
size_t size = ggml_nelements(tensor);
|
|
|
|
std::vector<float> data(size);
|
|
|
|
|
2024-01-17 17:54:56 +01:00
|
|
|
auto init_thread = [&](size_t ith, size_t start, size_t end) {
|
2023-12-07 21:26:54 +01:00
|
|
|
std::uniform_real_distribution<float> distribution(min, max);
|
|
|
|
for (size_t i = start; i < end; i++) {
|
2024-01-17 17:54:56 +01:00
|
|
|
data[i] = distribution(generators[ith]);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<std::thread> threads;
|
|
|
|
threads.reserve(n_threads);
|
|
|
|
for (size_t i = 0; i < n_threads; i++) {
|
|
|
|
size_t start = i*size/n_threads;
|
|
|
|
size_t end = (i+1)*size/n_threads;
|
2024-01-17 17:54:56 +01:00
|
|
|
threads.emplace_back(init_thread, i, start, end);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
for (auto & t : threads) {
|
|
|
|
t.join();
|
|
|
|
}
|
|
|
|
|
2024-05-18 02:39:54 +02:00
|
|
|
#if 0
|
|
|
|
const char * val_str = getenv("GGML_TEST_EPS");
|
|
|
|
float val = 1e-9f;
|
|
|
|
if (val_str != nullptr) {
|
|
|
|
val = std::stof(val_str);
|
|
|
|
printf("GGML_TEST_EPS=%e\n", val);
|
|
|
|
}
|
|
|
|
|
|
|
|
// test quantization with very small values that may result in nan scales due to division by zero
|
|
|
|
if (ggml_is_quantized(tensor->type)) {
|
|
|
|
for (int i = 0; i < 256; i++) {
|
|
|
|
data[i] = val;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
2024-05-08 08:30:09 +02:00
|
|
|
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
2023-12-07 21:26:54 +01:00
|
|
|
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
2023-12-14 20:05:21 +01:00
|
|
|
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
2024-01-17 17:54:56 +01:00
|
|
|
std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
|
|
|
|
const float * im = imatrix.data();
|
|
|
|
if (!ggml_quantize_requires_imatrix(tensor->type)) {
|
|
|
|
// when the imatrix is optional, we want to test both quantization with and without imatrix
|
|
|
|
// use one of the random numbers to decide
|
|
|
|
if (data[0] > 0.5f*(min + max)) {
|
|
|
|
im = nullptr;
|
|
|
|
}
|
|
|
|
}
|
2024-07-19 17:17:27 +02:00
|
|
|
|
2024-03-09 14:53:59 +01:00
|
|
|
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
|
2024-05-18 02:39:54 +02:00
|
|
|
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
|
2024-07-19 17:17:27 +02:00
|
|
|
// TODO: other cases
|
|
|
|
//#pragma omp parallel for
|
|
|
|
//for (int i = 0; i < tensor->ne[1]; i++) {
|
|
|
|
// ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
|
|
|
|
// i * tensor->ne[0], 1, tensor->ne[0], im);
|
|
|
|
//}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
2023-12-29 18:07:03 +01:00
|
|
|
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
|
|
|
// This is going to create some weird integers though.
|
|
|
|
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|
|
|
std::vector<float> tv;
|
|
|
|
tv.reserve(ggml_nelements(t));
|
|
|
|
|
|
|
|
std::vector<uint8_t> buf(ggml_nbytes(t));
|
|
|
|
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
|
|
|
|
size_t bs = ggml_blck_size(t->type);
|
2023-12-14 20:05:21 +01:00
|
|
|
std::vector<float> vq(ggml_blck_size(t->type));
|
|
|
|
bool quantized = ggml_is_quantized(t->type);
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// access elements by index to avoid gaps in views
|
|
|
|
for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
|
|
|
|
for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
|
|
|
|
for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
|
2023-12-13 13:04:25 +01:00
|
|
|
for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
|
|
|
|
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
|
2023-12-07 21:26:54 +01:00
|
|
|
if (t->type == GGML_TYPE_F16) {
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
|
2024-05-08 08:30:09 +02:00
|
|
|
} else if (t->type == GGML_TYPE_BF16) {
|
|
|
|
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else if (t->type == GGML_TYPE_F32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.push_back(*(float *) &buf[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
} else if (t->type == GGML_TYPE_I32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.push_back((float)*(int32_t *) &buf[i]);
|
2023-12-29 18:07:03 +01:00
|
|
|
} else if (t->type == GGML_TYPE_I16) {
|
|
|
|
tv.push_back((float)*(int16_t *) &buf[i]);
|
|
|
|
} else if (t->type == GGML_TYPE_I8) {
|
|
|
|
tv.push_back((float)*(int8_t *) &buf[i]);
|
2023-12-14 20:05:21 +01:00
|
|
|
} else if (quantized) {
|
2024-04-18 15:18:48 +02:00
|
|
|
tt.to_float(&buf[i], vq.data(), bs);
|
2023-12-13 13:04:25 +01:00
|
|
|
tv.insert(tv.end(), vq.begin(), vq.end());
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return tv;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
static double cosine_similarity(const float * v1, const float * v2, size_t n) {
|
|
|
|
double dot = 0.0;
|
|
|
|
double mag1 = 0.0;
|
|
|
|
double mag2 = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (std::isnan(v1[i]) || std::isnan(v2[i])) {
|
|
|
|
return -1.0f;
|
|
|
|
}
|
|
|
|
if (std::isinf(v1[i]) && std::isinf(v2[i])) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
dot += v1[i]*v2[i];
|
|
|
|
mag1 += v1[i]*v1[i];
|
|
|
|
mag2 += v2[i]*v2[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
return dot/sqrt(mag1*mag2);
|
|
|
|
}
|
|
|
|
|
|
|
|
static float distance(const float * v1, const float * v2, size_t n) {
|
|
|
|
double d = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (std::isnan(v1[i]) || std::isnan(v2[i])) {
|
|
|
|
return INFINITY;
|
|
|
|
}
|
|
|
|
if (std::isinf(v1[i]) && std::isinf(v2[i])) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
d += (v1[i] - v2[i])*(v1[i] - v2[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
return sqrt(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
static float vec_len(const float * v, size_t n) {
|
|
|
|
double d = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (std::isnan(v[i])) {
|
|
|
|
return INFINITY;
|
|
|
|
}
|
|
|
|
if (std::isinf(v[i])) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
d += v[i]*v[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
return sqrt(d);
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
|
|
|
// normalized mean squared error = mse(a, b) / mse(a, 0)
|
|
|
|
static double nmse(const float * a, const float * b, size_t n) {
|
|
|
|
double mse_a_b = 0.0;
|
|
|
|
double mse_a_0 = 0.0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
float a_i = a[i];
|
|
|
|
float b_i = b[i];
|
|
|
|
|
|
|
|
mse_a_b += (a_i - b_i) * (a_i - b_i);
|
|
|
|
mse_a_0 += a_i * a_i;
|
|
|
|
}
|
|
|
|
|
|
|
|
return mse_a_b / mse_a_0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// utils for printing the variables of the test cases
|
|
|
|
#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
static std::string var_to_str(const T & x) {
|
|
|
|
return std::to_string(x);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T, size_t N>
|
|
|
|
static std::string var_to_str(const T (&x)[N]) {
|
|
|
|
std::string s = "[";
|
|
|
|
for (size_t i = 0; i < N; i++) {
|
|
|
|
if (i > 0) {
|
|
|
|
s += ",";
|
|
|
|
}
|
|
|
|
s += var_to_str(x[i]);
|
|
|
|
}
|
|
|
|
s += "]";
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T, size_t N>
|
|
|
|
static std::string var_to_str(const std::array<T, N> & x) {
|
|
|
|
std::string s = "[";
|
|
|
|
for (size_t i = 0; i < N; i++) {
|
|
|
|
if (i > 0) {
|
|
|
|
s += ",";
|
|
|
|
}
|
|
|
|
s += var_to_str(x[i]);
|
|
|
|
}
|
|
|
|
s += "]";
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
//static std::string var_to_str(ggml_unary_op unary_op) {
|
|
|
|
// return ggml_unary_op_name(unary_op);
|
|
|
|
//}
|
|
|
|
|
|
|
|
static std::string var_to_str(ggml_type type) {
|
|
|
|
return ggml_type_name(type);
|
|
|
|
}
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
static std::string var_to_str(ggml_op_pool pool) {
|
|
|
|
switch (pool) {
|
|
|
|
case GGML_OP_POOL_AVG: return "avg";
|
|
|
|
case GGML_OP_POOL_MAX: return "max";
|
|
|
|
default: return std::to_string(pool);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
#define VARS_TO_STR1(a) VAR_TO_STR(a)
|
|
|
|
#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
|
|
|
|
#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
|
|
|
|
#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
|
|
|
|
#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
|
|
|
|
#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
|
|
|
|
#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
|
|
|
|
#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
|
|
|
|
#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
|
|
|
|
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
|
|
|
|
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
|
2024-01-31 14:10:15 +01:00
|
|
|
#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug functio, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add know issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export funciton print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug.
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math.
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 16:56:23 +01:00
|
|
|
#ifdef GGML_USE_SYCL
|
|
|
|
static bool inline _isinf(float f) {
|
|
|
|
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static bool inline _isinf(float f) { return std::isinf(f); }
|
|
|
|
#endif
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
// accept FLT_MAX as infinity
|
|
|
|
static bool isinf_or_max(float f) {
|
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug functio, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add know issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export funciton print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug.
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math.
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 16:56:23 +01:00
|
|
|
return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool ggml_is_view_op(enum ggml_op op) {
|
|
|
|
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
|
|
|
}
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
enum test_mode {
|
|
|
|
MODE_TEST,
|
|
|
|
MODE_PERF,
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
struct test_case {
|
|
|
|
virtual ~test_case() {}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
virtual std::string op_desc(ggml_tensor * t) {
|
|
|
|
return ggml_op_desc(t);
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
virtual std::string vars() {
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
|
|
|
|
|
|
|
|
virtual double max_nmse_err() {
|
2023-12-13 13:04:25 +01:00
|
|
|
return 1e-7;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
virtual void initialize_tensors(ggml_context * ctx) {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual size_t op_size(ggml_tensor * t) {
|
|
|
|
size_t size = ggml_nbytes(t);
|
|
|
|
// add source tensors
|
|
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
|
|
if (t->src[i] != NULL) {
|
|
|
|
size += ggml_nbytes(t->src[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_cgraph * gf = nullptr;
|
|
|
|
|
|
|
|
static const int sentinel_size = 1024;
|
|
|
|
|
|
|
|
test_mode mode;
|
|
|
|
|
|
|
|
std::vector<ggml_tensor *> sentinels;
|
|
|
|
|
|
|
|
void add_sentinel(ggml_context * ctx) {
|
|
|
|
if (mode == MODE_PERF) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
|
|
|
|
ggml_format_name(sentinel, "sent_%zu", sentinels.size());
|
|
|
|
sentinels.push_back(sentinel);
|
|
|
|
}
|
|
|
|
|
|
|
|
// hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
|
|
|
ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
|
|
|
|
add_sentinel(ctx);
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
|
2023-12-13 20:54:54 +01:00
|
|
|
mode = MODE_TEST;
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_init_params params = {
|
|
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
|
|
|
/* .mem_base = */ NULL,
|
|
|
|
/* .no_alloc = */ true,
|
|
|
|
};
|
|
|
|
ggml_context * ctx = ggml_init(params);
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
gf = ggml_new_graph(ctx);
|
|
|
|
|
|
|
|
// pre-graph sentinel
|
|
|
|
add_sentinel(ctx);
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
|
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
fflush(stdout);
|
|
|
|
|
2024-01-29 21:50:50 +01:00
|
|
|
// check if the backends support the ops
|
2023-12-29 09:32:31 +01:00
|
|
|
bool supported = true;
|
2023-12-07 21:26:54 +01:00
|
|
|
for (ggml_backend_t backend : {backend1, backend2}) {
|
2024-01-29 21:50:50 +01:00
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (!ggml_backend_supports_op(backend, t)) {
|
|
|
|
printf("not supported [%s] ", ggml_backend_name(backend));
|
|
|
|
supported = false;
|
|
|
|
break;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
2023-12-29 09:32:31 +01:00
|
|
|
if (!supported) {
|
|
|
|
printf("\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// post-graph sentinel
|
|
|
|
add_sentinel(ctx);
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// allocate
|
|
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
|
2024-01-12 20:07:38 +01:00
|
|
|
if (buf == NULL) {
|
|
|
|
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
|
|
|
|
ggml_free(ctx);
|
|
|
|
return false;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
// build graph
|
|
|
|
ggml_build_forward_expand(gf, out);
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// add sentinels as graph nodes so that they are checked in the callback
|
|
|
|
for (ggml_tensor * sentinel : sentinels) {
|
|
|
|
gf->nodes[gf->n_nodes++] = sentinel;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// randomize tensors
|
|
|
|
initialize_tensors(ctx);
|
|
|
|
|
|
|
|
// compare
|
|
|
|
struct callback_userdata {
|
|
|
|
bool ok;
|
|
|
|
double max_err;
|
2024-01-04 09:43:23 +01:00
|
|
|
ggml_backend_t backend1;
|
|
|
|
ggml_backend_t backend2;
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
callback_userdata ud {
|
|
|
|
true,
|
|
|
|
max_nmse_err(),
|
2024-01-04 09:43:23 +01:00
|
|
|
backend1,
|
|
|
|
backend2
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
|
2023-12-13 20:54:54 +01:00
|
|
|
callback_userdata * ud = (callback_userdata *) user_data;
|
2024-01-04 09:43:23 +01:00
|
|
|
const char * bn1 = ggml_backend_name(ud->backend1);
|
|
|
|
const char * bn2 = ggml_backend_name(ud->backend2);
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
if (t1->op == GGML_OP_NONE) {
|
|
|
|
// sentinels must be unchanged
|
|
|
|
std::vector<uint8_t> t1_data(ggml_nbytes(t1));
|
|
|
|
std::vector<uint8_t> t2_data(ggml_nbytes(t2));
|
|
|
|
ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
|
|
|
|
ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
|
|
|
|
|
|
|
|
if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
|
|
|
|
printf("sentinel mismatch: %s ", t1->name);
|
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
std::vector<float> f1 = tensor_to_float(t1);
|
|
|
|
std::vector<float> f2 = tensor_to_float(t2);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < f1.size(); i++) {
|
|
|
|
// check for nans
|
|
|
|
if (std::isnan(f1[i]) || std::isnan(f2[i])) {
|
2024-01-04 09:43:23 +01:00
|
|
|
printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// check for infs: both must be inf of the same sign, or both must be finite
|
|
|
|
if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
|
|
|
|
if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
|
|
|
|
if (std::signbit(f1[i]) != std::signbit(f2[i])) {
|
2024-01-04 09:43:23 +01:00
|
|
|
printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else {
|
2024-01-04 09:43:23 +01:00
|
|
|
printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
double err = nmse(f1.data(), f2.data(), f1.size());
|
|
|
|
if (err > ud->max_err) {
|
2024-01-09 08:58:55 +01:00
|
|
|
printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
|
2024-01-02 09:57:44 +01:00
|
|
|
//for (int i = 0; i < (int) f1.size(); i++) {
|
2023-12-13 20:54:54 +01:00
|
|
|
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
2023-12-13 13:04:25 +01:00
|
|
|
//}
|
|
|
|
//printf("\n");
|
2023-12-13 20:54:54 +01:00
|
|
|
//exit(1);
|
2023-12-07 21:26:54 +01:00
|
|
|
ud->ok = false;
|
|
|
|
}
|
|
|
|
return true;
|
2023-12-13 13:04:25 +01:00
|
|
|
|
|
|
|
GGML_UNUSED(index);
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
2024-01-12 20:07:38 +01:00
|
|
|
const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-01-12 20:07:38 +01:00
|
|
|
if (!cmp_ok) {
|
|
|
|
printf("compare failed ");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
|
|
|
|
ggml_free(ctx);
|
|
|
|
|
2024-01-12 20:07:38 +01:00
|
|
|
if (ud.ok && cmp_ok) {
|
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
|
|
return false;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
bool eval_perf(ggml_backend_t backend, const char * op_name) {
|
2023-12-13 20:54:54 +01:00
|
|
|
mode = MODE_PERF;
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
static const size_t graph_nodes = 8192;
|
|
|
|
|
|
|
|
ggml_init_params params = {
|
|
|
|
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
|
|
|
|
/* .mem_base = */ NULL,
|
|
|
|
/* .no_alloc = */ true,
|
|
|
|
};
|
|
|
|
ggml_context * ctx = ggml_init(params);
|
|
|
|
|
|
|
|
ggml_tensor * out = build_graph(ctx);
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
if (op_name != nullptr && op_desc(out) != op_name) {
|
|
|
|
//printf(" %s: skipping\n", op_desc(out).c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
2023-12-07 21:26:54 +01:00
|
|
|
fflush(stdout);
|
|
|
|
|
|
|
|
// check if backends support op
|
|
|
|
if (!ggml_backend_supports_op(backend, out)) {
|
|
|
|
printf("not supported\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// align while also leaving some margin for variations in parameters
|
|
|
|
int align = 20;
|
|
|
|
int last = (len + align - 1) / align * align;
|
|
|
|
if (last - len < 5) {
|
|
|
|
last += align;
|
|
|
|
}
|
|
|
|
last = std::max(last, 60);
|
|
|
|
printf("%*s", last - len, "");
|
|
|
|
|
|
|
|
// allocate
|
|
|
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
2024-01-12 20:07:38 +01:00
|
|
|
if (buf == NULL) {
|
|
|
|
printf("failed to allocate tensors\n");
|
|
|
|
ggml_free(ctx);
|
|
|
|
return false;
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
// randomize tensors
|
|
|
|
initialize_tensors(ctx);
|
|
|
|
|
|
|
|
// build graph
|
|
|
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
|
|
|
|
ggml_build_forward_expand(gf, out);
|
|
|
|
|
|
|
|
// warmup run
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
|
|
|
|
// duplicate the op
|
|
|
|
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
|
|
|
|
int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
|
|
|
|
for (int i = 1; i < n_runs; i++) {
|
|
|
|
gf->nodes[gf->n_nodes++] = out;
|
|
|
|
}
|
|
|
|
|
|
|
|
// calculate memory
|
|
|
|
size_t mem = n_runs * op_size(out);
|
|
|
|
auto tensor_op_size = [](ggml_tensor * t) {
|
|
|
|
size_t size = ggml_nbytes(t);
|
|
|
|
// add source tensors
|
|
|
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
|
|
if (t->src[i] != NULL) {
|
|
|
|
size += ggml_nbytes(t->src[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return size;
|
|
|
|
};
|
|
|
|
for (int i = 0; i < gf->n_nodes; i++) {
|
2023-12-13 13:04:25 +01:00
|
|
|
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
|
2023-12-07 21:26:54 +01:00
|
|
|
continue;
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
mem += tensor_op_size(gf->nodes[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// run
|
|
|
|
ggml_backend_synchronize(backend);
|
|
|
|
|
|
|
|
int64_t start_time = ggml_time_us();
|
|
|
|
ggml_backend_graph_compute(backend, gf);
|
|
|
|
ggml_backend_synchronize(backend);
|
|
|
|
int64_t end_time = ggml_time_us();
|
|
|
|
double time_us = end_time - start_time;
|
|
|
|
|
|
|
|
printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
|
|
|
|
n_runs,
|
|
|
|
time_us / n_runs,
|
|
|
|
op_size(out) / 1024,
|
|
|
|
mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
|
|
|
|
|
|
|
|
ggml_backend_buffer_free(buf);
|
|
|
|
|
|
|
|
ggml_free(ctx);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_UNARY
|
|
|
|
struct test_unary : public test_case {
|
|
|
|
const ggml_unary_op op;
|
|
|
|
const ggml_type type;
|
2024-06-12 15:00:22 +02:00
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
int v; // view (1 : non-contiguous a)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-06-12 15:00:22 +02:00
|
|
|
return VARS_TO_STR3(type, ne_a, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_unary(ggml_unary_op op,
|
|
|
|
ggml_type type = GGML_TYPE_F32,
|
2024-06-12 15:00:22 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
|
|
|
|
int v = 0)
|
|
|
|
: op(op), type(type), ne_a(ne_a), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-06-12 15:00:22 +02:00
|
|
|
ggml_tensor * a;
|
|
|
|
if (v & 1) {
|
|
|
|
auto ne = ne_a; ne[0] *= 3;
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
|
|
} else {
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
}
|
|
|
|
ggml_tensor * out = ggml_unary(ctx, a, op);
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
// test extended range of values to check for NaNs in GELU
|
|
|
|
init_tensor_uniform(t, -150.f, 150.f);
|
|
|
|
}
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_GET_ROWS
|
|
|
|
struct test_get_rows : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const int n; // cols
|
|
|
|
const int m; // rows
|
|
|
|
const int r; // rows to get
|
2023-12-13 13:04:25 +01:00
|
|
|
const int b; // batch size
|
|
|
|
const bool v; // view (non-contiguous src1)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2023-12-13 13:04:25 +01:00
|
|
|
return VARS_TO_STR6(type, n, m, r, b, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
|
|
|
|
: type(type), n(n), m(m), r(r), b(b), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
|
|
|
|
ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
|
|
|
|
if (v) {
|
|
|
|
rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_get_rows(ctx, in, rows);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
if (ggml_is_view_op(t->op)) { continue; }
|
2023-12-07 21:26:54 +01:00
|
|
|
// rows
|
2023-12-13 13:04:25 +01:00
|
|
|
std::vector<int> data(r*b);
|
|
|
|
for (int i = 0; i < r*b; i++) {
|
2023-12-07 21:26:54 +01:00
|
|
|
data[i] = rand() % m;
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_REPEAT
|
|
|
|
struct test_repeat : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const std::array<int, 4> nr;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
return ggml_nbytes(t) * 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_repeat(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
|
|
|
std::array<int, 4> nr = {2, 2, 2, 2})
|
|
|
|
: type(type), ne(ne), nr(nr) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_repeat(ctx, src, target);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_DUP
|
|
|
|
struct test_dup : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2023-12-29 18:07:03 +01:00
|
|
|
const std::array<int64_t, 4> permute;
|
|
|
|
bool _use_permute;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2023-12-29 18:07:03 +01:00
|
|
|
std::string v = VARS_TO_STR2(type, ne);
|
|
|
|
if (_use_permute) v += "," + VAR_TO_STR(permute);
|
|
|
|
return v;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_dup(ggml_type type = GGML_TYPE_F32,
|
2024-07-17 13:23:50 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 20, 1},
|
2023-12-29 18:07:03 +01:00
|
|
|
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
|
|
|
: type(type), ne(ne), permute(permute),
|
|
|
|
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
2023-12-29 18:07:03 +01:00
|
|
|
if (_use_permute) {
|
|
|
|
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_dup(ctx, src);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_CPY
|
|
|
|
struct test_cpy : public test_case {
|
|
|
|
const ggml_type type_src;
|
|
|
|
const ggml_type type_dst;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2024-07-17 13:23:50 +02:00
|
|
|
const std::array<int64_t, 4> permute;
|
|
|
|
bool _src_use_permute;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-07-17 13:23:50 +02:00
|
|
|
return VARS_TO_STR4(type_src, type_dst, ne, permute);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-06-23 13:14:45 +02:00
|
|
|
double max_nmse_err() override {
|
|
|
|
return 1e-6;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
2024-07-17 13:23:50 +02:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
2024-08-01 15:26:22 +02:00
|
|
|
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
2024-07-17 13:23:50 +02:00
|
|
|
: type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
|
|
|
|
_src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
2024-07-17 13:23:50 +02:00
|
|
|
if (_src_use_permute) {
|
|
|
|
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
|
|
|
}
|
|
|
|
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
|
2023-12-07 21:26:54 +01:00
|
|
|
ggml_tensor * out = ggml_cpy(ctx, src, dst);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_CONT
|
|
|
|
struct test_cont : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cont(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 1})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
src = ggml_transpose(ctx, src);
|
|
|
|
ggml_tensor * out = ggml_cont(ctx, src);
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_ADD
|
|
|
|
// GGML_OP_MUL
|
|
|
|
// GGML_OP_DIV
|
|
|
|
struct test_bin_bcast : public test_case {
|
|
|
|
using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
|
|
|
|
op_t op;
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const std::array<int, 4> nr;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
return ggml_nbytes(t) * 3;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 1, 1},
|
|
|
|
std::array<int, 4> nr = {1, 2, 1, 1})
|
|
|
|
: op(op), type(type), ne(ne), nr(nr) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = op(ctx, a, b);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (op == ggml_div) {
|
|
|
|
// avoid division by zero
|
|
|
|
init_tensor_uniform(t, 1.0f, 2.0f);
|
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SCALE
|
|
|
|
struct test_scale : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2023-12-21 22:20:49 +01:00
|
|
|
float scale;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2023-12-21 22:20:49 +01:00
|
|
|
return VARS_TO_STR3(type, ne, scale);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_scale(ggml_type type = GGML_TYPE_F32,
|
2023-12-21 22:20:49 +01:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
|
|
|
float scale = 2.0f)
|
|
|
|
: type(type), ne(ne), scale(scale) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_scale(ctx, a, scale);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_NORM
|
|
|
|
struct test_norm : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
float eps;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, eps);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_norm(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {64, 10, 10, 10},
|
|
|
|
float eps = 1e-6f)
|
|
|
|
: type(type), ne(ne), eps(eps) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_norm(ctx, a, eps);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_RMS_NORM
|
|
|
|
struct test_rms_norm : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
float eps;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, eps);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_rms_norm(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {64, 10, 10, 10},
|
|
|
|
float eps = 1e-6f)
|
|
|
|
: type(type), ne(ne), eps(eps) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-08-26 16:55:36 +02:00
|
|
|
// GGML_OP_SSM_CONV
|
|
|
|
struct test_ssm_conv : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const std::array<int64_t, 4> ne_b;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne_a, ne_b);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_ssm_conv(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
|
|
|
|
std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
|
|
|
|
: type(type), ne_a(ne_a), ne_b(ne_b) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
|
|
ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SSM_SCAN
|
|
|
|
struct test_ssm_scan : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
|
|
|
|
const int64_t d_state;
|
|
|
|
const int64_t d_inner;
|
|
|
|
const int64_t n_seq_tokens;
|
|
|
|
const int64_t n_seqs;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_ssm_scan(ggml_type type = GGML_TYPE_F32,
|
|
|
|
int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
|
|
|
|
: type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * x = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * dt = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, 1 , 1 }.data());
|
|
|
|
ggml_tensor * B = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * C = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
|
|
|
|
ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_MUL_MAT
|
|
|
|
struct test_mul_mat : public test_case {
|
|
|
|
const ggml_type type_a;
|
|
|
|
const ggml_type type_b;
|
|
|
|
const int64_t m;
|
|
|
|
const int64_t n;
|
|
|
|
const int64_t k;
|
|
|
|
const std::array<int64_t, 2> bs; // dims 3 and 4
|
|
|
|
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 5e-4;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
|
|
|
size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
|
|
|
|
size_t b = ggml_nbytes(t->src[1]) * m;
|
|
|
|
size_t c = ggml_nbytes(t);
|
|
|
|
return a + b + c;
|
|
|
|
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
|
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
|
|
|
std::array<int64_t, 2> bs = {10, 10},
|
|
|
|
std::array<int64_t, 2> nr = {2, 2})
|
|
|
|
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
|
|
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
|
|
|
|
ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
|
|
|
|
ggml_tensor * out = ggml_mul_mat(ctx, a, b);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_MUL_MAT_ID
|
|
|
|
struct test_mul_mat_id : public test_case {
|
|
|
|
const ggml_type type_a;
|
|
|
|
const ggml_type type_b;
|
|
|
|
const int n_mats;
|
2024-04-18 15:18:48 +02:00
|
|
|
const int n_used;
|
|
|
|
const bool b; // brodcast b matrix
|
2023-12-07 21:26:54 +01:00
|
|
|
const int64_t m;
|
|
|
|
const int64_t n;
|
|
|
|
const int64_t k;
|
|
|
|
|
|
|
|
std::string vars() override {
|
2024-04-18 15:18:48 +02:00
|
|
|
return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 5e-4;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t op_size(ggml_tensor * t) override {
|
2023-12-13 13:04:25 +01:00
|
|
|
size_t a = ggml_nbytes(t->src[2]) * n;
|
2023-12-07 21:26:54 +01:00
|
|
|
size_t b = ggml_nbytes(t->src[1]) * m;
|
|
|
|
size_t c = ggml_nbytes(t);
|
|
|
|
return a + b + c;
|
|
|
|
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
2024-04-18 15:18:48 +02:00
|
|
|
int n_mats = 8, int n_used = 2, bool b = false,
|
|
|
|
int64_t m = 32, int64_t n = 32, int64_t k = 32)
|
|
|
|
: type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
|
|
|
|
m(m), n(n), k(k) {
|
|
|
|
GGML_ASSERT(n_used <= n_mats);
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
2024-04-18 15:18:48 +02:00
|
|
|
ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
|
2023-12-13 13:04:25 +01:00
|
|
|
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
|
2024-04-18 15:18:48 +02:00
|
|
|
if (n_used != n_mats) {
|
|
|
|
ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
2024-04-18 15:18:48 +02:00
|
|
|
ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
|
|
|
|
ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
2023-12-13 13:04:25 +01:00
|
|
|
std::random_device rd;
|
|
|
|
std::default_random_engine rng(rd());
|
2023-12-07 21:26:54 +01:00
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
2023-12-13 13:04:25 +01:00
|
|
|
if (ggml_is_view_op(t->op)) { continue; }
|
2023-12-07 21:26:54 +01:00
|
|
|
// ids
|
2023-12-13 13:04:25 +01:00
|
|
|
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
|
|
|
std::vector<int32_t> data(t->ne[0]);
|
|
|
|
for (int i = 0; i < t->ne[0]; i++) {
|
|
|
|
data[i] = i % n_mats;
|
|
|
|
}
|
|
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
|
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SQR
|
|
|
|
struct test_sqr : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sqr(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_sqr(ctx, a);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-06-17 00:23:04 +02:00
|
|
|
// GGML_OP_SQRT
|
|
|
|
struct test_sqrt : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sqrt(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_sqrt(ctx, a);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
// fill with positive values
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
init_tensor_uniform(t, 0.0f, 100.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
// GGML_OP_SIN
|
|
|
|
struct test_sin : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sin(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_sin(ctx, a);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
init_tensor_uniform(t, -100.0f, 100.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_COS
|
|
|
|
struct test_cos : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cos(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_cos(ctx, a);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
init_tensor_uniform(t, -100.0f, 100.0f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_CLAMP
|
|
|
|
struct test_clamp : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
float min;
|
|
|
|
float max;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, ne, min, max);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_clamp(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
|
|
|
float min = -0.5f, float max = 0.5f)
|
|
|
|
: type(type), ne(ne), min(min), max(max) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_clamp(ctx, a, min, max);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_DIAG_MASK_INF
|
|
|
|
struct test_diag_mask_inf : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const int n_past;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, n_past);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
|
|
|
int n_past = 5)
|
|
|
|
: type(type), ne(ne), n_past(n_past) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SOFT_MAX
|
|
|
|
struct test_soft_max : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
2024-01-29 21:50:50 +01:00
|
|
|
const bool mask;
|
2024-02-17 22:04:16 +01:00
|
|
|
const float scale;
|
|
|
|
const float max_bias;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-02-17 22:04:16 +01:00
|
|
|
return VARS_TO_STR5(type, ne, mask, scale, max_bias);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
// the 1024 test with bias occasionally fails:
|
|
|
|
// SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
|
|
|
|
virtual double max_nmse_err() override {
|
|
|
|
return 1e-6;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
test_soft_max(ggml_type type = GGML_TYPE_F32,
|
2024-01-29 21:50:50 +01:00
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
2024-02-17 22:04:16 +01:00
|
|
|
bool mask = false,
|
2024-01-29 21:50:50 +01:00
|
|
|
float scale = 1.0f,
|
2024-02-17 22:04:16 +01:00
|
|
|
float max_bias = 0.0f)
|
|
|
|
: type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-02-17 22:04:16 +01:00
|
|
|
ggml_tensor * mask = nullptr;
|
|
|
|
if (this->mask) {
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
|
2024-02-17 22:04:16 +01:00
|
|
|
}
|
2024-05-11 09:32:41 +02:00
|
|
|
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-07-17 13:23:50 +02:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_ROPE
|
|
|
|
struct test_rope : public test_case {
|
|
|
|
const ggml_type type;
|
2024-05-29 19:17:31 +02:00
|
|
|
const std::array<int64_t, 4> ne_a;
|
2023-12-07 21:26:54 +01:00
|
|
|
int n_dims;
|
|
|
|
int mode;
|
2024-06-05 10:29:20 +02:00
|
|
|
int n_ctx; // used to generate positions
|
2024-05-29 19:17:31 +02:00
|
|
|
float fs; // freq_scale
|
|
|
|
float ef; // ext_factor
|
|
|
|
float af; // attn_factor
|
2024-05-22 10:01:35 +02:00
|
|
|
bool ff;
|
2024-05-29 19:17:31 +02:00
|
|
|
int v; // view (1 : non-contiguous a)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-05-29 19:17:31 +02:00
|
|
|
return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_rope(ggml_type type = GGML_TYPE_F32,
|
2024-05-29 19:17:31 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
|
|
|
|
int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
|
|
|
|
: type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-05-29 19:17:31 +02:00
|
|
|
ggml_tensor * a;
|
|
|
|
if (v & 1) {
|
|
|
|
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
|
|
} else {
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
}
|
|
|
|
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
|
2024-05-22 10:01:35 +02:00
|
|
|
ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
|
2024-06-05 10:29:20 +02:00
|
|
|
ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
|
|
|
// pos
|
2024-05-29 19:17:31 +02:00
|
|
|
std::vector<int> data(ne_a[2]);
|
|
|
|
for (int i = 0; i < ne_a[2]; i++) {
|
2023-12-07 21:26:54 +01:00
|
|
|
data[i] = rand() % n_ctx;
|
|
|
|
}
|
2024-05-29 19:17:31 +02:00
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
2024-05-22 10:01:35 +02:00
|
|
|
if (t->ne[0] == n_dims/2) {
|
|
|
|
// frequency factors in the range [0.9f, 1.1f]
|
|
|
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
// GGML_OP_POOL2D
|
|
|
|
struct test_pool2d : public test_case {
|
|
|
|
enum ggml_op_pool pool_type;
|
|
|
|
const ggml_type type_input;
|
|
|
|
const std::array<int64_t, 4> ne_input;
|
|
|
|
// kernel size
|
|
|
|
const int k0;
|
|
|
|
const int k1;
|
|
|
|
// stride
|
|
|
|
const int s0;
|
|
|
|
const int s1;
|
|
|
|
// padding
|
|
|
|
const int p0;
|
|
|
|
const int p1;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
|
|
|
|
ggml_type type_input = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
|
|
|
int k0 = 3, int k1 = 3,
|
|
|
|
int s0 = 1, int s1 = 1,
|
|
|
|
int p0 = 1, int p1 = 1)
|
|
|
|
: pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
|
|
|
ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-07-02 18:09:52 +02:00
|
|
|
// GGML_OP_CONV_TRANSPOSE_1D
|
|
|
|
struct test_conv_transpose_1d : public test_case {
|
|
|
|
const std::array<int64_t, 4> ne_input;
|
|
|
|
const std::array<int64_t, 4> ne_kernel;
|
|
|
|
|
2024-07-08 09:39:36 +02:00
|
|
|
const int s0; // stride
|
|
|
|
const int p0; // padding
|
|
|
|
const int d0; // dilation
|
2024-07-02 18:09:52 +02:00
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
|
|
|
|
std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
|
|
|
int s0 = 1, int p0 = 0, int d0 = 1)
|
|
|
|
: ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
|
|
|
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
|
|
|
ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// GGML_OP_IM2COL
|
|
|
|
struct test_im2col : public test_case {
|
|
|
|
const ggml_type type_input;
|
|
|
|
const ggml_type type_kernel;
|
2024-01-31 14:10:15 +01:00
|
|
|
const ggml_type dst_type;
|
2023-12-07 21:26:54 +01:00
|
|
|
const std::array<int64_t, 4> ne_input;
|
|
|
|
const std::array<int64_t, 4> ne_kernel;
|
|
|
|
// stride
|
|
|
|
const int s0;
|
|
|
|
const int s1;
|
|
|
|
// padding
|
|
|
|
const int p0;
|
|
|
|
const int p1;
|
2024-07-02 18:09:52 +02:00
|
|
|
// dilation
|
2023-12-07 21:26:54 +01:00
|
|
|
const int d0;
|
|
|
|
const int d1;
|
|
|
|
// mode
|
|
|
|
const bool is_2D;
|
|
|
|
|
|
|
|
std::string vars() override {
|
2024-01-31 14:10:15 +01:00
|
|
|
return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
|
2023-12-07 21:26:54 +01:00
|
|
|
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
|
|
|
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
|
|
|
int s0 = 1, int s1 = 1,
|
|
|
|
int p0 = 1, int p1 = 1,
|
|
|
|
int d0 = 1, int d1 = 1,
|
|
|
|
bool is_2D = true)
|
2024-01-31 14:10:15 +01:00
|
|
|
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
|
|
|
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
2024-01-31 14:10:15 +01:00
|
|
|
ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_CONCAT
|
|
|
|
struct test_concat : public test_case {
|
|
|
|
const ggml_type type;
|
2024-05-28 10:04:19 +02:00
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const int64_t ne_b_d;
|
|
|
|
const int dim;
|
2024-05-29 14:38:26 +02:00
|
|
|
const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-05-29 14:38:26 +02:00
|
|
|
return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_concat(ggml_type type = GGML_TYPE_F32,
|
2024-05-28 10:04:19 +02:00
|
|
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
|
|
|
|
int64_t ne_b_d = 10,
|
2024-05-29 14:38:26 +02:00
|
|
|
int dim = 2, int v = 0)
|
|
|
|
: type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-05-28 10:04:19 +02:00
|
|
|
auto ne_b = ne_a;
|
|
|
|
ne_b[dim] = ne_b_d;
|
2024-05-29 14:38:26 +02:00
|
|
|
ggml_tensor * a;
|
|
|
|
if (v & 1) {
|
|
|
|
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
|
|
|
} else {
|
|
|
|
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
}
|
|
|
|
ggml_tensor * b;
|
|
|
|
if (v & 2) {
|
|
|
|
auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
|
|
|
|
b = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
|
|
|
|
} else {
|
|
|
|
b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
|
|
}
|
2024-05-28 10:04:19 +02:00
|
|
|
ggml_tensor * out = ggml_concat(ctx, a, b, dim);
|
2023-12-07 21:26:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_ARGSORT
|
|
|
|
struct test_argsort : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
ggml_sort_order order;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, order);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_argsort(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {16, 10, 10, 10},
|
2024-02-25 11:09:09 +01:00
|
|
|
ggml_sort_order order = GGML_SORT_ORDER_ASC)
|
2023-12-07 21:26:54 +01:00
|
|
|
: type(type), ne(ne), order(order) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_argsort(ctx, a, order);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
std::random_device rd;
|
|
|
|
std::default_random_engine rng(rd());
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
|
|
|
// indices
|
|
|
|
std::vector<int> data(ggml_nelements(t));
|
|
|
|
for (int i = 0; i < ggml_nelements(t); i++) {
|
|
|
|
data[i] = rand();
|
|
|
|
}
|
|
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
|
|
|
|
} else if (t->type == GGML_TYPE_F32) {
|
|
|
|
// initialize with unique values to avoid ties
|
|
|
|
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
|
|
|
std::vector<float> data(t->ne[0]);
|
|
|
|
for (int i = 0; i < t->ne[0]; i++) {
|
|
|
|
data[i] = i;
|
|
|
|
}
|
|
|
|
std::shuffle(data.begin(), data.end(), rng);
|
|
|
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
|
|
|
|
}
|
|
|
|
} else {
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_SUM_ROWS
|
|
|
|
struct test_sum_rows : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_sum_rows(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_sum_rows(ctx, a);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// GGML_OP_UPSCALE
|
|
|
|
struct test_upscale : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const int32_t scale_factor;
|
2024-05-15 10:52:33 +02:00
|
|
|
const bool transpose;
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
2024-05-15 10:52:33 +02:00
|
|
|
return VARS_TO_STR4(type, ne, scale_factor, transpose);
|
2023-12-13 20:54:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
test_upscale(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
2024-05-15 10:52:33 +02:00
|
|
|
int32_t scale_factor = 2, bool transpose = false)
|
|
|
|
: type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-05-15 10:52:33 +02:00
|
|
|
if (transpose) a = ggml_transpose(ctx, a);
|
2023-12-13 20:54:54 +01:00
|
|
|
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-05-15 10:52:33 +02:00
|
|
|
// GGML_OP_UPSCALE (ext)
|
|
|
|
struct test_upscale_ext : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const std::array<int64_t, 4> ne_tgt;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, ne_tgt);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_upscale_ext(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {2, 5, 7, 11},
|
|
|
|
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
|
|
|
|
: type(type), ne(ne), ne_tgt(ne_tgt) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// GGML_OP_GROUP_NORM
|
|
|
|
struct test_group_norm : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
const int32_t num_groups;
|
2024-08-06 09:26:46 +02:00
|
|
|
const float eps;
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne, num_groups);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_group_norm(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {64, 64, 320, 1},
|
2024-08-06 09:26:46 +02:00
|
|
|
int32_t num_groups = 32,
|
|
|
|
float eps = 1e-6f)
|
|
|
|
: type(type), ne(ne), num_groups(num_groups), eps(eps) {}
|
2023-12-13 20:54:54 +01:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
2024-08-06 09:26:46 +02:00
|
|
|
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
|
2023-12-13 20:54:54 +01:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_ACC
|
|
|
|
struct test_acc : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const std::array<int64_t, 4> ne_b;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne_a, ne_b);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_acc(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
|
|
|
|
std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
|
|
|
|
: type(type), ne_a(ne_a), ne_b(ne_b) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
|
|
|
ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_PAD
|
|
|
|
struct test_pad : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const int pad_0;
|
|
|
|
const int pad_1;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_pad(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
|
|
|
|
int pad_0 = 1, int pad_1 = 1)
|
|
|
|
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-03-03 13:23:52 +01:00
|
|
|
// GGML_OP_ARANGE
|
|
|
|
struct test_arange : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const float start;
|
|
|
|
const float stop;
|
|
|
|
const float step;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, start, stop, step);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_arange(ggml_type type = GGML_TYPE_F32,
|
|
|
|
float start = 0.f, float stop = 10.f, float step = 1.f)
|
|
|
|
: type(type), start(start), stop(stop), step(step) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * out = ggml_arange(ctx, start, stop, step);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// GGML_OP_TIMESTEP_EMBEDDING
|
|
|
|
struct test_timestep_embedding : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const int dim;
|
|
|
|
const int max_period;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR4(type, ne_a, dim, max_period);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
|
|
|
|
int dim = 320, int max_period=10000)
|
|
|
|
: type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
// GGML_OP_LEAKY_RELU
|
|
|
|
struct test_leaky_relu : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne_a;
|
|
|
|
const float negative_slope;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR3(type, ne_a, negative_slope);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_leaky_relu(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
|
|
|
|
float negative_slope = 0.1f)
|
|
|
|
: type(type), ne_a(ne_a), negative_slope(negative_slope) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
|
|
|
ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
// GGML_OP_FLASH_ATTN_EXT
|
|
|
|
struct test_flash_attn_ext : public test_case {
|
|
|
|
const int64_t hs; // head size
|
|
|
|
const int64_t nh; // num heads
|
|
|
|
const int64_t kv; // kv size
|
|
|
|
const int64_t nb; // batch size
|
|
|
|
|
2024-05-14 18:09:30 +02:00
|
|
|
const bool mask; // use mask
|
|
|
|
|
2024-05-11 09:32:41 +02:00
|
|
|
const float max_bias; // ALiBi
|
2024-08-24 21:34:59 +02:00
|
|
|
const float logit_softcap; // Gemma 2
|
2024-05-11 09:32:41 +02:00
|
|
|
|
2024-06-01 08:44:14 +02:00
|
|
|
const ggml_type type_KV;
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
std::string vars() override {
|
2024-08-24 21:34:59 +02:00
|
|
|
return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 5e-4;
|
|
|
|
}
|
|
|
|
|
2024-08-24 21:34:59 +02:00
|
|
|
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
|
|
|
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
2024-06-01 23:26:10 +02:00
|
|
|
const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
|
|
|
|
|
|
|
|
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
|
|
|
|
ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
|
|
|
ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
2024-05-14 18:09:30 +02:00
|
|
|
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
|
2024-08-24 21:34:59 +02:00
|
|
|
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
// GGML_OP_CROSS_ENTROPY_LOSS
|
|
|
|
struct test_cross_entropy_loss : public test_case {
|
|
|
|
const ggml_type type;
|
|
|
|
const std::array<int64_t, 4> ne;
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
return VARS_TO_STR2(type, ne);
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
|
|
|
|
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
|
|
|
: type(type), ne(ne) {}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
|
|
ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-01-29 21:50:50 +01:00
|
|
|
enum llm_norm_type {
|
|
|
|
LLM_NORM,
|
|
|
|
LLM_NORM_RMS,
|
|
|
|
};
|
|
|
|
|
|
|
|
struct llama_hparams {
|
|
|
|
uint32_t n_vocab;
|
|
|
|
uint32_t n_embd;
|
|
|
|
uint32_t n_head;
|
|
|
|
uint32_t n_head_kv;
|
|
|
|
static constexpr uint32_t n_layer = 1;
|
|
|
|
uint32_t n_rot;
|
|
|
|
uint32_t n_embd_head; // dimension of values (d_v)
|
|
|
|
uint32_t n_ff;
|
|
|
|
|
|
|
|
float f_norm_eps;
|
|
|
|
float f_norm_rms_eps;
|
|
|
|
|
|
|
|
// cparams
|
|
|
|
static constexpr uint32_t n_ctx = 512; // user-specified context size
|
2024-06-05 10:29:20 +02:00
|
|
|
static constexpr uint32_t n_ctx_orig = n_ctx;
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
// batch
|
|
|
|
int32_t n_tokens;
|
|
|
|
|
|
|
|
// llm_build_context
|
|
|
|
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
|
|
|
|
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
|
|
|
|
|
|
|
|
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
|
|
|
|
return n_embd_head * n_head_kv;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// LLM base class
|
|
|
|
struct test_llm : public test_case {
|
|
|
|
llama_hparams hp;
|
|
|
|
|
|
|
|
protected:
|
|
|
|
test_llm(llama_hparams hp)
|
|
|
|
: hp(std::move(hp)) {
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
struct ggml_tensor * llm_build_norm(
|
|
|
|
struct ggml_context * ctx,
|
|
|
|
struct ggml_tensor * cur,
|
|
|
|
struct ggml_tensor * mw,
|
|
|
|
struct ggml_tensor * mb,
|
|
|
|
llm_norm_type type) {
|
|
|
|
switch (type) {
|
|
|
|
case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
|
|
|
|
case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
|
|
|
|
}
|
|
|
|
cur = ggml_mul(ctx, cur, mw);
|
|
|
|
if (mb) {
|
|
|
|
cur = ggml_add(ctx, cur, mb);
|
|
|
|
}
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
void llm_build_kv_store(
|
|
|
|
struct ggml_context * ctx,
|
|
|
|
struct ggml_tensor * k_l,
|
|
|
|
struct ggml_tensor * v_l,
|
|
|
|
struct ggml_tensor * k_cur,
|
|
|
|
struct ggml_tensor * v_cur) {
|
|
|
|
// compute the transposed [n_tokens, n_embd] V matrix
|
|
|
|
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
|
|
|
|
|
|
|
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
|
|
|
|
(ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
|
|
|
|
|
|
|
|
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
|
|
|
|
( hp.n_ctx)*ggml_element_size(v_l),
|
|
|
|
(hp.kv_head)*ggml_element_size(v_l));
|
|
|
|
|
|
|
|
// important: storing RoPE-ed version of K in the KV cache!
|
|
|
|
ggml_cpy(ctx, k_cur, k_cache_view);
|
|
|
|
ggml_cpy(ctx, v_cur_t, v_cache_view);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ggml_tensor * llm_build_kqv(
|
|
|
|
struct ggml_context * ctx,
|
|
|
|
struct ggml_tensor * k_l,
|
|
|
|
struct ggml_tensor * v_l,
|
|
|
|
struct ggml_tensor * q_cur,
|
|
|
|
struct ggml_tensor * kq_mask,
|
|
|
|
float kq_scale) {
|
|
|
|
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
|
|
|
|
|
|
|
struct ggml_tensor * k =
|
|
|
|
ggml_view_3d(ctx, k_l,
|
|
|
|
hp.n_embd_head, hp.n_kv, hp.n_head_kv,
|
|
|
|
ggml_row_size(k_l->type, hp.n_embd_gqa()),
|
|
|
|
ggml_row_size(k_l->type, hp.n_embd_head),
|
|
|
|
0);
|
|
|
|
|
|
|
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
|
|
|
|
2024-05-11 09:32:41 +02:00
|
|
|
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
// split cached v into n_head heads
|
|
|
|
struct ggml_tensor * v =
|
|
|
|
ggml_view_3d(ctx, v_l,
|
|
|
|
hp.n_kv, hp.n_embd_head, hp.n_head_kv,
|
|
|
|
ggml_element_size(v_l)*hp.n_ctx,
|
|
|
|
ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
|
|
|
|
0);
|
|
|
|
|
|
|
|
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
|
|
|
|
|
|
|
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
|
|
|
|
|
|
|
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
|
|
|
|
|
|
|
|
struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
|
|
|
cur = ggml_mul_mat(ctx, wo, cur);
|
|
|
|
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensors(ggml_context * ctx) override {
|
|
|
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
|
|
if (t->type == GGML_TYPE_I32) {
|
|
|
|
// pos
|
|
|
|
std::vector<int> data(hp.n_tokens);
|
|
|
|
for (int i = 0; i < hp.n_tokens; i++) {
|
|
|
|
data[i] = rand() % hp.n_ctx;
|
|
|
|
}
|
|
|
|
ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
|
|
|
|
} else {
|
|
|
|
init_tensor_uniform(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Llama
|
|
|
|
struct test_llama : public test_llm {
|
|
|
|
static constexpr float freq_base = 10000.0f;
|
|
|
|
static constexpr float freq_scale = 1.0f;
|
|
|
|
static constexpr float ext_factor = 0.0f;
|
|
|
|
static constexpr float attn_factor = 1.0f;
|
|
|
|
static constexpr float beta_fast = 32.0f;
|
|
|
|
static constexpr float beta_slow = 1.0f;
|
|
|
|
|
|
|
|
std::string op_desc(ggml_tensor * t) override {
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
return "LLAMA";
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
auto n_tokens = hp.n_tokens;
|
|
|
|
return VARS_TO_STR1(n_tokens);
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 2e-3;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_llama(int n_tokens = 1)
|
|
|
|
: test_llm({
|
|
|
|
/*n_vocab =*/ 32000,
|
|
|
|
/*n_embd =*/ 3200,
|
|
|
|
/*n_head =*/ 32,
|
|
|
|
/*n_head_kv =*/ 32,
|
|
|
|
/*n_rot =*/ 100,
|
|
|
|
/*n_embd_head =*/ 100,
|
|
|
|
/*n_ff =*/ 8640,
|
|
|
|
/*f_norm_eps =*/ 0.f,
|
|
|
|
/*f_norm_rms_eps =*/ 1e-5f,
|
|
|
|
/*n_tokens =*/ n_tokens,
|
|
|
|
}) {
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
struct ggml_tensor * cur;
|
|
|
|
struct ggml_tensor * inpL;
|
|
|
|
|
|
|
|
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
|
|
|
|
|
|
|
// inp_pos - contains the positions
|
|
|
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
|
|
|
|
|
|
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
|
|
|
|
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
|
|
|
struct ggml_tensor * inpSA = inpL;
|
|
|
|
|
|
|
|
// norm
|
|
|
|
ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
|
|
|
|
// self-attention
|
|
|
|
{
|
|
|
|
ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
|
|
|
ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
|
|
|
ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
|
|
|
|
|
|
|
// compute Q and K and RoPE them
|
|
|
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
|
|
|
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
|
|
|
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
|
|
|
|
|
2024-05-21 22:28:32 +02:00
|
|
|
Qcur = ggml_rope_ext(
|
|
|
|
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
|
2024-06-05 10:29:20 +02:00
|
|
|
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
2024-01-29 21:50:50 +01:00
|
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
2024-05-21 22:28:32 +02:00
|
|
|
Kcur = ggml_rope_ext(
|
|
|
|
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
|
2024-06-05 10:29:20 +02:00
|
|
|
hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
|
2024-01-29 21:50:50 +01:00
|
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
|
|
|
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
|
|
|
|
|
|
|
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
|
|
|
|
|
|
|
|
// feed-forward network
|
|
|
|
ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
|
|
|
|
ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
|
|
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
|
|
|
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
|
|
struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_gate, cur);
|
|
|
|
cur = ggml_silu(ctx, cur);
|
|
|
|
cur = ggml_mul(ctx, cur, tmp);
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
|
|
|
|
|
|
|
cur = ggml_add(ctx, cur, ffn_inp);
|
|
|
|
|
|
|
|
// input for next layer
|
|
|
|
inpL = cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
cur = inpL;
|
|
|
|
|
|
|
|
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
|
|
|
|
|
|
|
|
// lm_head
|
|
|
|
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
|
|
|
|
cur = ggml_mul_mat(ctx, output, cur);
|
|
|
|
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Falcon
|
|
|
|
struct test_falcon : public test_llm {
|
|
|
|
static constexpr float freq_base = 10000.0f;
|
|
|
|
static constexpr float freq_scale = 1.0f;
|
|
|
|
static constexpr float ext_factor = 0.0f;
|
|
|
|
static constexpr float attn_factor = 1.0f;
|
|
|
|
static constexpr float beta_fast = 32.0f;
|
|
|
|
static constexpr float beta_slow = 1.0f;
|
|
|
|
|
|
|
|
std::string op_desc(ggml_tensor * t) override {
|
|
|
|
GGML_UNUSED(t);
|
|
|
|
return "FALCON";
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string vars() override {
|
|
|
|
auto n_tokens = hp.n_tokens;
|
|
|
|
return VARS_TO_STR1(n_tokens);
|
|
|
|
}
|
|
|
|
|
|
|
|
double max_nmse_err() override {
|
|
|
|
return 2e-3;
|
|
|
|
}
|
|
|
|
|
|
|
|
test_falcon(int n_tokens = 1)
|
|
|
|
: test_llm({
|
|
|
|
/*n_vocab =*/ 32000,
|
|
|
|
/*n_embd =*/ 3200,
|
|
|
|
/*n_head =*/ 50,
|
|
|
|
/*n_head_kv =*/ 1,
|
|
|
|
/*n_rot =*/ 64,
|
|
|
|
/*n_embd_head =*/ 64,
|
|
|
|
/*n_ff =*/ 8640,
|
|
|
|
/*f_norm_eps =*/ 1e-5f,
|
|
|
|
/*f_norm_rms_eps =*/ 0.f,
|
|
|
|
/*n_tokens =*/ n_tokens,
|
|
|
|
}) {
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
|
|
struct ggml_tensor * cur;
|
|
|
|
struct ggml_tensor * inpL;
|
|
|
|
|
|
|
|
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
|
|
|
|
|
|
|
// inp_pos - contains the positions
|
|
|
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
|
|
|
|
|
|
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
2024-01-29 21:50:50 +01:00
|
|
|
|
|
|
|
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
|
|
|
|
|
|
|
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
|
|
|
// norm
|
|
|
|
ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
|
|
|
|
|
|
|
|
// self-attention
|
|
|
|
{
|
|
|
|
cur = attn_norm;
|
|
|
|
|
|
|
|
ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
|
|
|
|
|
|
|
|
cur = ggml_mul_mat(ctx, wqkv, cur);
|
|
|
|
|
|
|
|
struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
|
|
|
|
struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
|
|
|
|
struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
|
|
|
|
|
|
|
|
Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
|
|
|
|
Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
|
|
|
|
|
|
|
|
// using mode = 2 for neox mode
|
2024-05-21 22:28:32 +02:00
|
|
|
Qcur = ggml_rope_ext(
|
2024-06-05 10:29:20 +02:00
|
|
|
ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
2024-01-29 21:50:50 +01:00
|
|
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
2024-05-21 22:28:32 +02:00
|
|
|
Kcur = ggml_rope_ext(
|
2024-06-05 10:29:20 +02:00
|
|
|
ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
|
2024-01-29 21:50:50 +01:00
|
|
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
|
);
|
|
|
|
|
|
|
|
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
|
|
|
|
|
|
|
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ggml_tensor * ffn_inp = cur;
|
|
|
|
|
|
|
|
// feed forward
|
|
|
|
{
|
|
|
|
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
|
|
|
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
|
|
|
cur = attn_norm;
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_up, cur);
|
|
|
|
cur = ggml_gelu(ctx, cur);
|
|
|
|
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
|
|
|
}
|
|
|
|
|
|
|
|
cur = ggml_add(ctx, cur, ffn_inp);
|
|
|
|
|
|
|
|
cur = ggml_add(ctx, cur, inpL);
|
|
|
|
|
|
|
|
// input for next layer
|
|
|
|
inpL = cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
cur = inpL;
|
|
|
|
|
|
|
|
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
|
|
|
cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
|
|
|
|
|
|
|
|
// lm_head
|
|
|
|
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
|
|
|
|
cur = ggml_mul_mat(ctx, output, cur);
|
|
|
|
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
|
|
|
std::vector<std::unique_ptr<test_case>> test_cases;
|
2024-01-09 08:58:55 +01:00
|
|
|
std::default_random_engine rng(0);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
const ggml_type all_types[] = {
|
2024-05-08 08:30:09 +02:00
|
|
|
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
2023-12-13 13:04:25 +01:00
|
|
|
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
|
|
|
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
|
|
|
GGML_TYPE_Q8_0,
|
|
|
|
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
|
|
|
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
2024-01-17 17:54:56 +01:00
|
|
|
GGML_TYPE_Q6_K,
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-06 03:48:47 +02:00
|
|
|
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
|
2024-02-26 17:28:38 +01:00
|
|
|
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
2024-03-26 15:21:27 +01:00
|
|
|
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
|
2024-02-27 15:34:24 +01:00
|
|
|
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
2023-12-13 13:04:25 +01:00
|
|
|
};
|
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
const ggml_type base_types[] = {
|
|
|
|
GGML_TYPE_F32, GGML_TYPE_F16,
|
|
|
|
GGML_TYPE_Q4_0,
|
|
|
|
GGML_TYPE_Q4_K,
|
|
|
|
GGML_TYPE_IQ2_XXS
|
|
|
|
};
|
|
|
|
|
|
|
|
const ggml_type other_types[] = {
|
|
|
|
GGML_TYPE_Q4_1,
|
|
|
|
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
|
|
|
GGML_TYPE_Q8_0,
|
|
|
|
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
|
|
|
GGML_TYPE_Q5_K,
|
|
|
|
GGML_TYPE_Q6_K,
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-06 03:48:47 +02:00
|
|
|
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
|
2024-04-18 15:18:48 +02:00
|
|
|
GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
|
|
|
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
|
|
|
|
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
2024-07-02 08:39:38 +02:00
|
|
|
GGML_TYPE_BF16,
|
2024-04-18 15:18:48 +02:00
|
|
|
};
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// unary ops
|
2024-06-12 15:00:22 +02:00
|
|
|
for (int v : {0, 1}) {
|
|
|
|
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
|
|
|
|
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
|
|
|
|
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
|
|
|
|
for (ggml_type type : all_types) {
|
|
|
|
for (int b : {1, 7}) {
|
|
|
|
for (bool v : {false, true}) {
|
|
|
|
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
|
|
|
|
}
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
2023-12-29 18:07:03 +01:00
|
|
|
for (int b : {1, 7}) {
|
|
|
|
for (bool v : {false, true}) {
|
|
|
|
test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
|
|
|
|
}
|
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-01-31 14:10:15 +01:00
|
|
|
for (ggml_type type_input : {GGML_TYPE_F32}) {
|
|
|
|
for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
|
|
|
|
for (int k0 : {1, 3}) {
|
|
|
|
for (int k1 : {1, 3}) {
|
|
|
|
for (int s0 : {1, 2}) {
|
|
|
|
for (int s1 : {1, 2}) {
|
|
|
|
for (int p0 : {0, 1}) {
|
|
|
|
for (int p1 : {0, 1}) {
|
|
|
|
test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
2024-08-02 10:50:53 +02:00
|
|
|
// test cases for 1D im2col
|
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
|
|
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
2024-01-31 14:10:15 +01:00
|
|
|
|
2024-08-20 17:06:51 +02:00
|
|
|
// sycl backend will limit task global_range < MAX_INT
|
|
|
|
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
|
|
|
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
|
|
|
// these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
|
|
|
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
|
|
|
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
|
|
|
|
2024-07-02 18:09:52 +02:00
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d());
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
|
|
|
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
|
|
|
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
|
2023-12-29 18:07:03 +01:00
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2023-12-29 18:07:03 +01:00
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
|
2023-12-29 18:07:03 +01:00
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
|
|
|
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2024-01-29 13:37:33 +01:00
|
|
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
|
|
for (ggml_type type_dst : all_types) {
|
|
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
|
|
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
|
|
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
|
2024-01-29 13:37:33 +01:00
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_cont());
|
2024-08-28 10:23:02 +02:00
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
|
|
|
|
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
|
|
|
|
for (auto op : {ggml_add, ggml_mul, ggml_div}) {
|
|
|
|
test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
|
2023-12-13 13:04:25 +01:00
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
|
2023-12-07 21:26:54 +01:00
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});
|
|
|
|
|
|
|
|
// stable diffusion
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
|
|
|
|
add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
|
2023-12-13 13:04:25 +01:00
|
|
|
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
|
|
|
|
//add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
test_cases.emplace_back(new test_scale());
|
|
|
|
|
|
|
|
for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
|
|
|
|
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
|
|
|
|
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
|
|
|
|
}
|
|
|
|
|
2024-08-26 16:55:36 +02:00
|
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
|
|
|
|
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
|
|
|
|
|
2024-07-19 17:17:27 +02:00
|
|
|
#if 1
|
2024-04-18 15:18:48 +02:00
|
|
|
for (ggml_type type_a : base_types) {
|
2023-12-29 09:32:31 +01:00
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
|
|
|
|
}
|
|
|
|
}
|
2024-07-19 17:17:27 +02:00
|
|
|
#else
|
|
|
|
// m = a rows
|
|
|
|
// n = b rows
|
|
|
|
// k = cols
|
|
|
|
std::uniform_int_distribution<> dist_m(1, 128);
|
|
|
|
std::uniform_int_distribution<> dist_n(16, 128);
|
|
|
|
std::uniform_int_distribution<> dist_k(1, 16);
|
|
|
|
for (int i = 0; i < 1000; i++) {
|
|
|
|
for (ggml_type type_a : all_types) {
|
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
|
|
int m = dist_m(rng);
|
|
|
|
int n = dist_n(rng);
|
|
|
|
int k = dist_k(rng) * ggml_blck_size(type_a);
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
for (ggml_type type_a : other_types) {
|
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
2024-08-05 07:52:55 +02:00
|
|
|
if (ggml_blck_size(type_a) != 256) {
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
|
|
|
|
}
|
|
|
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
|
2024-04-18 15:18:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-03-22 08:36:03 +01:00
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
|
|
|
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
|
|
|
|
|
2024-08-20 17:06:51 +02:00
|
|
|
// sycl backend will limit task global_range < MAX_INT
|
|
|
|
// test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
|
|
|
|
// however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
|
|
|
// this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
|
|
|
// test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
|
|
|
|
|
2024-04-18 15:18:48 +02:00
|
|
|
for (ggml_type type_a : base_types) {
|
2023-12-07 21:26:54 +01:00
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
2024-04-18 15:18:48 +02:00
|
|
|
for (int n_mats : {4, 8}) {
|
|
|
|
for (int n_used : {1, 2, 4}) {
|
|
|
|
for (bool b : {false, true}) {
|
|
|
|
for (int n : {1, 32}) {
|
|
|
|
int m = 512;
|
|
|
|
int k = 256;
|
|
|
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (ggml_type type_a : other_types) {
|
|
|
|
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
|
|
|
for (int n_mats : {4}) {
|
|
|
|
for (int n_used : {2}) {
|
|
|
|
for (bool b : {false}) {
|
|
|
|
for (int n : {1}) {
|
|
|
|
int m = 512;
|
|
|
|
int k = 256;
|
|
|
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_sqr());
|
2024-06-17 00:23:04 +02:00
|
|
|
test_cases.emplace_back(new test_sqrt());
|
2024-08-27 21:01:45 +02:00
|
|
|
test_cases.emplace_back(new test_sin());
|
|
|
|
test_cases.emplace_back(new test_cos());
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_clamp());
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
|
|
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
|
|
|
|
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
|
|
|
|
|
2024-02-17 22:04:16 +01:00
|
|
|
#if 0
|
2024-01-09 08:58:55 +01:00
|
|
|
std::uniform_int_distribution<> dist_ne1(1, 50);
|
|
|
|
int exponent = 1;
|
|
|
|
while (exponent < (1 << 17)) {
|
|
|
|
std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
|
|
|
|
|
|
|
|
for (int n = 0; n < 10; ++n) {
|
|
|
|
int64_t ne0 = dist_ne0(rng);
|
|
|
|
int64_t ne1 = dist_ne1(rng);
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
|
2024-01-09 08:58:55 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
exponent <<= 1;
|
|
|
|
}
|
2024-02-17 22:04:16 +01:00
|
|
|
#endif
|
|
|
|
for (bool mask : {false, true}) {
|
|
|
|
for (float max_bias : {0.0f, 8.0f}) {
|
2024-05-11 09:32:41 +02:00
|
|
|
if (!mask && max_bias > 0.0f) continue;
|
2024-02-17 22:04:16 +01:00
|
|
|
for (float scale : {1.0f, 0.1f}) {
|
|
|
|
for (int64_t ne0 : {16, 1024}) {
|
|
|
|
for (int64_t ne1 : {16, 1024}) {
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
|
2024-02-17 22:04:16 +01:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-07-17 13:23:50 +02:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
|
2024-02-17 22:04:16 +01:00
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
|
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
|
|
|
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
|
2024-01-29 21:50:50 +01:00
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
{
|
|
|
|
bool all = true;
|
|
|
|
|
|
|
|
for (float v : { 0, 1 }) {
|
|
|
|
for (float fs : { 1.0f, 1.4245f }) {
|
|
|
|
for (float ef : { 0.0f, 0.7465f }) {
|
|
|
|
for (float af : { 1.0f, 1.4245f }) {
|
|
|
|
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
|
|
for (bool ff : {false, true}) { // freq_factors
|
2024-06-05 10:29:20 +02:00
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
|
|
|
|
|
|
|
|
if (all) {
|
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
|
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
|
|
|
|
test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
|
|
|
|
}
|
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
if (all) {
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
|
|
|
|
}
|
|
|
|
|
|
|
|
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
|
|
|
|
}
|
|
|
|
}
|
2024-06-05 10:29:20 +02:00
|
|
|
|
2024-05-29 19:17:31 +02:00
|
|
|
all = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-05-22 10:01:35 +02:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2024-05-29 14:38:26 +02:00
|
|
|
for (int v : { 0, 1, 2, 3 }) {
|
|
|
|
for (int dim : { 0, 1, 2, 3, }) {
|
|
|
|
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
|
|
|
|
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
|
|
|
|
}
|
2024-05-28 10:04:19 +02:00
|
|
|
}
|
2023-12-07 21:26:54 +01:00
|
|
|
|
2024-02-25 11:09:09 +01:00
|
|
|
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
2023-12-13 13:04:25 +01:00
|
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
2023-12-07 21:26:54 +01:00
|
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
2024-04-03 15:07:05 +02:00
|
|
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
2023-12-13 20:54:54 +01:00
|
|
|
test_cases.emplace_back(new test_sum_rows());
|
|
|
|
test_cases.emplace_back(new test_upscale());
|
2024-05-15 10:52:33 +02:00
|
|
|
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
|
|
|
|
test_cases.emplace_back(new test_upscale_ext());
|
2023-12-13 20:54:54 +01:00
|
|
|
test_cases.emplace_back(new test_group_norm());
|
|
|
|
test_cases.emplace_back(new test_acc());
|
|
|
|
test_cases.emplace_back(new test_pad());
|
2024-03-03 13:23:52 +01:00
|
|
|
test_cases.emplace_back(new test_arange());
|
|
|
|
test_cases.emplace_back(new test_timestep_embedding());
|
2023-12-13 20:54:54 +01:00
|
|
|
test_cases.emplace_back(new test_leaky_relu());
|
2023-12-13 13:04:25 +01:00
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
for (int hs : { 64, 80, 128, 256, }) {
|
2024-05-14 18:09:30 +02:00
|
|
|
for (bool mask : { true, false } ) {
|
|
|
|
for (float max_bias : { 0.0f, 8.0f }) {
|
|
|
|
if (!mask && max_bias > 0.0f) continue;
|
2024-08-24 21:34:59 +02:00
|
|
|
for (float logit_softcap : {0.0f, 10.0f}) {
|
|
|
|
if (hs != 128 && logit_softcap != 0.0f) continue;
|
|
|
|
for (int nh : { 32, }) {
|
|
|
|
for (int kv : { 512, 1024, }) {
|
|
|
|
for (int nb : { 1, 2, 4, 8, }) {
|
|
|
|
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
|
|
|
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
|
|
|
|
}
|
2024-06-01 08:44:14 +02:00
|
|
|
}
|
2024-05-14 18:09:30 +02:00
|
|
|
}
|
2024-05-11 09:32:41 +02:00
|
|
|
}
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-27 21:01:45 +02:00
|
|
|
test_cases.emplace_back(new test_cross_entropy_loss());
|
|
|
|
|
2024-02-13 10:20:24 +01:00
|
|
|
// these tests are disabled to save execution time, but they can be handy for debugging
|
|
|
|
#if 0
|
2024-01-29 21:50:50 +01:00
|
|
|
test_cases.emplace_back(new test_llama(1));
|
|
|
|
test_cases.emplace_back(new test_llama(2));
|
|
|
|
test_cases.emplace_back(new test_falcon(1));
|
|
|
|
test_cases.emplace_back(new test_falcon(2));
|
|
|
|
#endif
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
// run tests
|
|
|
|
if (mode == MODE_TEST) {
|
|
|
|
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
|
|
|
|
|
|
|
size_t n_ok = 0;
|
|
|
|
for (auto & test : test_cases) {
|
|
|
|
if (test->eval(backend, backend_cpu, op_name)) {
|
|
|
|
n_ok++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
|
|
|
|
|
|
|
|
ggml_backend_free(backend_cpu);
|
|
|
|
|
|
|
|
return n_ok == test_cases.size();
|
2023-12-13 13:04:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (mode == MODE_PERF) {
|
2023-12-07 21:26:54 +01:00
|
|
|
for (auto & test : test_cases) {
|
|
|
|
test->eval_perf(backend, op_name);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2024-07-27 04:41:55 +02:00
|
|
|
GGML_ABORT("fatal error");
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void usage(char ** argv) {
|
|
|
|
printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
|
|
|
|
printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
|
|
|
|
printf(" op names are as given by ggml_op_desc()\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
int main(int argc, char ** argv) {
|
|
|
|
test_mode mode = MODE_TEST;
|
2024-03-13 14:58:30 +01:00
|
|
|
const char * op_name_filter = NULL;
|
|
|
|
const char * backend_filter = NULL;
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
|
|
if (strcmp(argv[i], "test") == 0) {
|
|
|
|
mode = MODE_TEST;
|
|
|
|
} else if (strcmp(argv[i], "perf") == 0) {
|
|
|
|
mode = MODE_PERF;
|
|
|
|
} else if (strcmp(argv[i], "-o") == 0) {
|
|
|
|
if (i + 1 < argc) {
|
2024-03-13 14:58:30 +01:00
|
|
|
op_name_filter = argv[++i];
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
|
|
|
usage(argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
} else if (strcmp(argv[i], "-b") == 0) {
|
|
|
|
if (i + 1 < argc) {
|
2024-03-13 14:58:30 +01:00
|
|
|
backend_filter = argv[++i];
|
2023-12-07 21:26:54 +01:00
|
|
|
} else {
|
|
|
|
usage(argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
usage(argv);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// enumerate backends
|
|
|
|
printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
|
|
|
|
|
|
|
|
size_t n_ok = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
|
|
|
|
printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
|
|
|
|
|
2024-03-13 14:58:30 +01:00
|
|
|
if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
|
2023-12-07 21:26:54 +01:00
|
|
|
printf(" Skipping\n");
|
|
|
|
n_ok++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
|
|
|
|
GGML_ASSERT(backend != NULL);
|
2024-03-13 14:58:30 +01:00
|
|
|
|
|
|
|
if (backend_filter == NULL && ggml_backend_is_cpu(backend)) {
|
|
|
|
printf(" Skipping CPU backend\n");
|
|
|
|
ggml_backend_free(backend);
|
|
|
|
n_ok++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
printf(" Backend name: %s\n", ggml_backend_name(backend));
|
|
|
|
|
2024-03-13 14:58:30 +01:00
|
|
|
bool ok = test_backend(backend, mode, op_name_filter);
|
2023-12-07 21:26:54 +01:00
|
|
|
|
|
|
|
printf(" Backend %s: ", ggml_backend_name(backend));
|
|
|
|
if (ok) {
|
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
|
|
n_ok++;
|
|
|
|
} else {
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\n");
|
|
|
|
|
|
|
|
ggml_backend_free(backend);
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2023-12-07 21:26:54 +01:00
|
|
|
if (n_ok != ggml_backend_reg_get_count()) {
|
|
|
|
printf("\033[1;31mFAIL\033[0m\n");
|
|
|
|
return 1;
|
|
|
|
}
|
2023-12-13 13:04:25 +01:00
|
|
|
|
2024-01-17 17:54:56 +01:00
|
|
|
ggml_quantize_free();
|
|
|
|
|
2023-12-13 13:04:25 +01:00
|
|
|
printf("\033[1;32mOK\033[0m\n");
|
|
|
|
return 0;
|
2023-12-07 21:26:54 +01:00
|
|
|
}
|