#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <chrono>
#include <cinttypes>
#include <clocale>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <cstdlib>
#include <iterator>
#include <map>
#include <numeric>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ggml-cuda.h"
#include "ggml-sycl.h"
// utils

static uint64_t get_time_ns() {
    using clock = std::chrono::high_resolution_clock;
    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
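// Join the elements of a vector into a single delimiter-separated string,
// e.g. join(std::vector<int>{512, 1024}, ",") -> "512,1024".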
template<class T>
static std::string join(const std::vector<T> & values, const std::string & delim) {
    std::ostringstream str;
    for (size_t i = 0; i < values.size(); i++) {
        str << values[i];
        if (i < values.size() - 1) {
            str << delim;
        }
    }
    return str.str();
}
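// Split a delimiter-separated string and parse each token with operator>>,
// e.g. split<int>("512,1024", ',') -> {512, 1024} and split<bool>("0,1", ',') -> {false, true}.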
template<class T>
static std::vector<T> split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}
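// Convert each element to a string with the given formatter; used below in print_usage
// to render lists of enum-valued parameters (ggml_type_name, split_mode_str, pair_str).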
template<typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
    std::vector<std::string> str_values;
    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
    return str_values;
}
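// Sample mean and Bessel-corrected sample standard deviation:
//   avg(v)   = sum(v_i) / n
//   stdev(v) = sqrt((sum(v_i^2) - n * avg(v)^2) / (n - 1))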
template<typename T>
static T avg(const std::vector<T> & v) {
    if (v.empty()) {
        return 0;
    }
    T sum = std::accumulate(v.begin(), v.end(), T(0));
    return sum / (T)v.size();
}

template<typename T>
static T stdev(const std::vector<T> & v) {
    if (v.size() <= 1) {
        return 0;
    }
    T mean = avg(v);
    T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
    T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
    return stdev;
}
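// Best-effort CPU identifier: on Linux this is the trimmed "model name" field from
// /proc/cpuinfo; on other platforms an empty string is returned.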
static std::string get_cpu_info() {
    std::string id;
#ifdef __linux__
    FILE * f = fopen("/proc/cpuinfo", "r");
    if (f) {
        char buf[1024];
        while (fgets(buf, sizeof(buf), f)) {
            if (strncmp(buf, "model name", 10) == 0) {
                char * p = strchr(buf, ':');
                if (p) {
                    p++;
                    while (std::isspace(*p)) {
                        p++;
                    }
                    while (std::isspace(p[strlen(p) - 1])) {
                        p[strlen(p) - 1] = '\0';
                    }
                    id = p;
                    break;
                }
            }
        }
        fclose(f);
    }
#endif
    // TODO: other platforms
    return id;
}
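// Best-effort GPU identifier: concatenates the device descriptions reported by the
// CUDA and/or SYCL backends, separated by "/"; empty for other builds.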
static std::string get_gpu_info() {
    std::string id;
#ifdef GGML_USE_CUDA
    int count = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < count; i++) {
        char buf[128];
        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
        id += buf;
        if (i < count - 1) {
            id += "/";
        }
    }
#endif
#ifdef GGML_USE_SYCL
    int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; i++) {
        char buf[128];
        ggml_sycl_get_device_description(i, buf, sizeof(buf));
        id += buf;
        if (i < count - 1) {
            id += "/";
        }
    }
#endif
    // TODO: other backends
    return id;
}
// command line params

enum output_formats {CSV, JSON, MARKDOWN, SQL};

static const char * output_format_str(output_formats format) {
    switch (format) {
        case CSV:      return "csv";
        case JSON:     return "json";
        case MARKDOWN: return "md";
        case SQL:      return "sql";
        default: GGML_ASSERT(!"invalid output format");
    }
}

static const char * split_mode_str(llama_split_mode mode) {
    switch (mode) {
        case LLAMA_SPLIT_MODE_NONE:  return "none";
        case LLAMA_SPLIT_MODE_LAYER: return "layer";
        case LLAMA_SPLIT_MODE_ROW:   return "row";
        default: GGML_ASSERT(!"invalid split mode");
    }
}
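// Format an n_prompt/n_gen pair as "pp,tg" for the -pg option. Note the static scratch
// buffer: the function is not thread-safe, but the returned std::string owns a copy.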
static std::string pair_str(const std::pair<int, int> & p) {
    static char buf[32];
    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
    return buf;
}
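// Everything accepted on the command line. Each field is a vector because every flag may be
// given multiple comma-separated values (or be repeated), so a single run can sweep several
// configurations (see cmd_params_instance below).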
struct cmd_params {
    std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<std::pair<int, int>> n_pg;
    std::vector<int> n_batch;
    std::vector<int> n_ubatch;
    std::vector<ggml_type> type_k;
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
    std::vector<bool> flash_attn;
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    std::vector<bool> embeddings;
    ggml_numa_strategy numa;
    int reps;
    bool verbose;
    output_formats output_format;
};
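// Defaults used for any parameter that is not specified on the command line
// (applied at the end of parse_cmd_params).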
static const cmd_params cmd_params_defaults = {
    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
    /* n_pg          */ {{512, 128}},
    /* n_batch       */ {2048},
    /* n_ubatch      */ {512},
    /* type_k        */ {GGML_TYPE_F16},
    /* type_v        */ {GGML_TYPE_F16},
    /* n_threads     */ {get_math_cpu_count()},
    /* n_gpu_layers  */ {99},
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
    /* flash_attn    */ {false},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* embeddings    */ {false},
    /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps          */ 5,
    /* verbose       */ false,
    /* output_format */ MARKDOWN
};
static void print_usage(int /* argc */, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
    printf("  -m, --model <filename>               (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
    printf("  -p, --n-prompt <n>                   (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                      (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -pg <pp,tg>                          (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
    printf("  -b, --batch-size <n>                 (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
    printf("  -ub, --ubatch-size <n>               (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
    printf("  -ctk, --cache-type-k <t>             (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
    printf("  -ctv, --cache-type-v <t>             (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                    (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>             (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>   (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                  (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>         (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -fa, --flash-attn <0|1>              (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -mmp, --mmap <0|1>                   (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
    printf("  --numa <distribute|isolate|numactl>  (default: disabled)\n");
    printf("  -embd, --embeddings <0|1>            (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>     (default: 0)\n");
    printf("  -r, --repetitions <n>                (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql>       (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
    printf("  -v, --verbose                        (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
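// Example invocation (illustrative only): benchmark two thread counts and print a markdown table:
//   ./llama-bench -m models/7B/ggml-model-q4_0.gguf -p 512 -n 128 -t 8,16 -o md

// Map a KV-cache type name given via -ctk/-ctv to a ggml_type. GGML_TYPE_COUNT is returned
// for unknown names and is treated as an invalid parameter by the caller.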
static ggml_type ggml_type_from_name(const std::string & s) {
    if (s == "f16") {
        return GGML_TYPE_F16;
    }
    if (s == "q8_0") {
        return GGML_TYPE_Q8_0;
    }
    if (s == "q4_0") {
        return GGML_TYPE_Q4_0;
    }
    if (s == "q4_1") {
        return GGML_TYPE_Q4_1;
    }
    if (s == "q5_0") {
        return GGML_TYPE_Q5_0;
    }
    if (s == "q5_1") {
        return GGML_TYPE_Q5_1;
    }
    if (s == "iq4_nl") {
        return GGML_TYPE_IQ4_NL;
    }

    return GGML_TYPE_COUNT;
}
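// Parse the command line into a cmd_params. Underscores in --long options are normalized to
// dashes, an unknown option or malformed value prints the usage text and exits with status 1,
// and any parameter that was not specified falls back to cmd_params_defaults.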
static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
    bool invalid_param = false;
    const std::string arg_prefix = "--";
    const char split_delim = ',';

    params.verbose = cmd_params_defaults.verbose;
    params.output_format = cmd_params_defaults.output_format;
    params.reps = cmd_params_defaults.reps;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv);
            exit(0);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], split_delim);
            params.model.insert(params.model.end(), p.begin(), p.end());
        } else if (arg == "-p" || arg == "--n-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
        } else if (arg == "-n" || arg == "--n-gen") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
        } else if (arg == "-pg") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], ',');
            if (p.size() != 2) {
                invalid_param = true;
                break;
            }
            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
        } else if (arg == "-ub" || arg == "--ubatch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
        } else if (arg == "-ctk" || arg == "--cache-type-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], split_delim);
            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
                if (gt == GGML_TYPE_COUNT) {
                    invalid_param = true;
                    break;
                }
                types.push_back(gt);
            }
            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
        } else if (arg == "-ctv" || arg == "--cache-type-v") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], split_delim);
            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
                if (gt == GGML_TYPE_COUNT) {
                    invalid_param = true;
                    break;
                }
                types.push_back(gt);
            }
            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], split_delim);
            std::vector<llama_split_mode> modes;
            for (const auto & m : p) {
                llama_split_mode mode;
                if (m == "none") {
                    mode = LLAMA_SPLIT_MODE_NONE;
                } else if (m == "layer") {
                    mode = LLAMA_SPLIT_MODE_LAYER;
                } else if (m == "row") {
                    mode = LLAMA_SPLIT_MODE_ROW;
                } else {
                    invalid_param = true;
                    break;
                }
                modes.push_back(mode);
            }
            params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
        } else if (arg == "-mg" || arg == "--main-gpu") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.main_gpu = split<int>(argv[i], split_delim);
        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
        } else if (arg == "--numa") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            } else {
                std::string value(argv[i]);
                /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
                else if (value == "isolate")                   { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
                else if (value == "numactl")                   { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                else { invalid_param = true; break; }
            }
        } else if (arg == "-fa" || arg == "--flash-attn") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
        } else if (arg == "-embd" || arg == "--embeddings") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
        } else if (arg == "-ts" || arg == "--tensor-split") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            for (auto ts : split<std::string>(argv[i], split_delim)) {
                // split string by ; and /
                const std::regex regex{R"([;/]+)"};
                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
                std::vector<std::string> split_arg{it, {}};
                GGML_ASSERT(split_arg.size() <= llama_max_devices());

                std::vector<float> tensor_split(llama_max_devices());
                for (size_t i = 0; i < llama_max_devices(); ++i) {
                    if (i < split_arg.size()) {
                        tensor_split[i] = std::stof(split_arg[i]);
                    } else {
                        tensor_split[i] = 0.0f;
                    }
                }
                params.tensor_split.push_back(tensor_split);
            }
        } else if (arg == "-r" || arg == "--repetitions") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.reps = std::stoi(argv[i]);
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            if (argv[i] == std::string("csv")) {
                params.output_format = CSV;
            } else if (argv[i] == std::string("json")) {
                params.output_format = JSON;
            } else if (argv[i] == std::string("md")) {
                params.output_format = MARKDOWN;
            } else if (argv[i] == std::string("sql")) {
                params.output_format = SQL;
            } else {
                invalid_param = true;
                break;
            }
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else {
            invalid_param = true;
            break;
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv);
        exit(1);
    }

    // set defaults
    if (params.model.empty())         { params.model = cmd_params_defaults.model; }
    if (params.n_prompt.empty())      { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())         { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_pg.empty())          { params.n_pg = cmd_params_defaults.n_pg; }
    if (params.n_batch.empty())       { params.n_batch = cmd_params_defaults.n_batch; }
    if (params.n_ubatch.empty())      { params.n_ubatch = cmd_params_defaults.n_ubatch; }
    if (params.type_k.empty())        { params.type_k = cmd_params_defaults.type_k; }
    if (params.type_v.empty())        { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty())  { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.split_mode.empty())    { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())      { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()) { params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
    if (params.flash_attn.empty())    { params.flash_attn = cmd_params_defaults.flash_attn; }
    if (params.tensor_split.empty())  { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty())      { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.embeddings.empty())    { params.embeddings = cmd_params_defaults.embeddings; }
    if (params.n_threads.empty())     { params.n_threads = cmd_params_defaults.n_threads; }

    return params;
}
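// One concrete benchmark configuration: a single value for each cmd_params field.
// to_llama_mparams()/to_llama_cparams() translate it into llama.cpp model and context parameters,
// and equal_mparams() compares only the model-level fields, presumably so an already-loaded model
// can be reused when two instances differ only in context-level settings.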
struct cmd_params_instance {
    std::string model;
    int n_prompt;
    int n_gen;
    int n_batch;
    int n_ubatch;
    ggml_type type_k;
    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
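    // Map this instance onto llama.cpp model parameters; fields not set here keep the
    // values from llama_model_default_params().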
llama_model_params to_llama_mparams() const {
|
|
|
|
llama_model_params mparams = llama_model_default_params();
|
|
|
|
|
|
|
|
mparams.n_gpu_layers = n_gpu_layers;
|
2024-01-12 20:07:38 +01:00
|
|
|
mparams.split_mode = split_mode;
|
2023-09-28 21:42:38 +02:00
|
|
|
mparams.main_gpu = main_gpu;
|
|
|
|
mparams.tensor_split = tensor_split.data();
|
2024-02-01 20:48:53 +01:00
|
|
|
mparams.use_mmap = use_mmap;
|
2023-09-28 21:42:38 +02:00
|
|
|
|
|
|
|
return mparams;
|
|
|
|
}
|
|
|
|
|
|
|
|
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
    }
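
    // Map the context-level settings of this instance onto llama_context_params.
    // Rough sketch of the intended use (names as in llama.h of this period):
    //   llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());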
    llama_context_params to_llama_cparams() const {
        llama_context_params cparams = llama_context_default_params();

        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
        cparams.n_ubatch = n_ubatch;
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.offload_kqv = !no_kv_offload;
        cparams.flash_attn = flash_attn;
        cparams.embeddings = embeddings;

        return cparams;
    }
};
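
// Expand the cross product of all command-line parameter lists into concrete benchmark
// instances. Each instance is one (model, batching, cache type, offload, prompt/generation
// length, ...) combination that is timed separately.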
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
    std::vector<cmd_params_instance> instances;

    // this ordering minimizes the number of times that each model needs to be reloaded
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & mmp : params.use_mmap)
    for (const auto & embd : params.embeddings)
    for (const auto & nb : params.n_batch)
    for (const auto & nub : params.n_ubatch)
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & fa : params.flash_attn)
    for (const auto & nt : params.n_threads) {
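        // prompt-processing (pp) instances: prompt tokens only, no generation (n_gen == 0)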
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
                continue;
            }
            cmd_params_instance instance = {
                /* .model        = */ m,
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
        }
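
        // text-generation (tg) instances: generation only, no prompt (n_prompt == 0)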
        for (const auto & n_gen : params.n_gen) {
            if (n_gen == 0) {
                continue;
            }
            cmd_params_instance instance = {
                /* .model        = */ m,
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
        }
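
        // combined prompt + generation (pg) instances: each n_pg is a (n_prompt, n_gen) pair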
        for (const auto & n_pg : params.n_pg) {
            if (n_pg.first == 0 && n_pg.second == 0) {
                continue;
            }
            cmd_params_instance instance = {
                /* .model        = */ m,
                /* .n_prompt     = */ n_pg.first,
                /* .n_gen        = */ n_pg.second,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
        }
    }

    return instances;
}
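
// Result record for a single benchmark configuration: build and hardware info, the
// parameters of the run, and the collected per-repetition timings in samples_ns.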
struct test {
    static const std::string build_commit;
    static const int build_number;
    static const bool cuda;
    static const bool opencl;
    static const bool vulkan;
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
    static const std::string gpu_info;
    std::string model_filename;
    std::string model_type;
    uint64_t model_size;
    uint64_t model_n_params;
    int n_batch;
    int n_ubatch;
    int n_threads;
    ggml_type type_k;
    ggml_type type_v;
    int n_gpu_layers;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
    int n_prompt;
    int n_gen;
    std::string test_time;
    std::vector<uint64_t> samples_ns;
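
    // Snapshot the instance parameters and model metadata up front; the timing samples
    // in samples_ns are appended later by the benchmark loop, one per repetition.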
    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
        model_filename = inst.model;
        char buf[128];
        llama_model_desc(lmodel, buf, sizeof(buf));
        model_type = buf;
        model_size = llama_model_size(lmodel);
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_ubatch = inst.n_ubatch;
        n_threads = inst.n_threads;
        type_k = inst.type_k;
        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
        flash_attn = inst.flash_attn;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        embeddings = inst.embeddings;
        n_prompt = inst.n_prompt;
        n_gen = inst.n_gen;
        // RFC 3339 date-time format
        time_t t = time(NULL);
        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
        test_time = buf;

        (void) ctx;
    }

    uint64_t avg_ns() const {
        return ::avg(samples_ns);
    }

    uint64_t stdev_ns() const {
        return ::stdev(samples_ns);
    }
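
    // Convert each nanosecond sample into a tokens-per-second rate:
    // ts = n_tokens / (t_ns / 1e9) = 1e9 * n_tokens / t_ns.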
    std::vector<double> get_ts() const {
        int n_tokens = n_prompt + n_gen;
        std::vector<double> ts;
        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
        return ts;
    }

    double avg_ts() const {
        return ::avg(get_ts());
    }

    double stdev_ts() const {
        return ::stdev(get_ts());
    }
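
    // Report the single backend this binary was built with, checked in a fixed priority
    // order; plain CPU is the fallback when no acceleration is compiled in.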
    static std::string get_backend() {
        if (cuda) {
            return GGML_CUDA_NAME;
        }
        if (opencl) {
            return "OpenCL";
        }
        if (vulkan) {
            return "Vulkan";
        }
        if (kompute) {
            return "Kompute";
        }
        if (metal) {
            return "Metal";
        }
        if (sycl) {
            return GGML_SYCL_NAME;
        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
        if (blas) {
            return "BLAS";
        }
        return "CPU";
    }
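
    // Column names for the benchmark output. The order is significant: it is assumed to
    // match the corresponding per-test values getter used by the output formatters.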
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
            "n_threads", "type_k", "type_v",
            "n_gpu_layers", "split_mode",
2024-04-30 11:16:08 +02:00
|
|
|
"main_gpu", "no_kv_offload", "flash_attn",
|
2024-03-07 15:32:38 +01:00
|
|
|
"tensor_split", "use_mmap", "embeddings",
|
2023-08-18 12:44:58 +02:00
|
|
|
"n_prompt", "n_gen", "test_time",
|
|
|
|
"avg_ns", "stddev_ns",
|
|
|
|
"avg_ts", "stddev_ts"
|
|
|
|
};
|
|
|
|
return fields;
|
|
|
|
}
|
|
|
|
|
|
|
|
enum field_type {STRING, BOOL, INT, FLOAT};
|
|
|
|
|
|
|
|
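
    // Coarse type of each field, used by the printers below (JSON quoting and
    // booleans, SQL column types, markdown column alignment).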
    static field_type get_field_type(const std::string & field) {
        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
            field == "n_threads" ||
            field == "model_size" || field == "model_n_params" ||
            field == "n_gpu_layers" || field == "main_gpu" ||
            field == "n_prompt" || field == "n_gen" ||
            field == "avg_ns" || field == "stddev_ns") {
            return INT;
        }
        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" || field == "f16_kv" || field == "no_kv_offload" ||
            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
            return FLOAT;
        }
        return STRING;
    }
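
    // One row of values, in the same order as get_fields(). The tensor_split array is
    // flattened into a "/"-separated string up to its last non-zero entry.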
    std::vector<std::string> get_values() const {
        std::string tensor_split_str;
        int max_nonzero = 0;
        for (size_t i = 0; i < llama_max_devices(); i++) {
            if (tensor_split[i] > 0) {
                max_nonzero = i;
            }
        }
        for (int i = 0; i <= max_nonzero; i++) {
            char buf[32];
            snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
            tensor_split_str += buf;
            if (i < max_nonzero) {
                tensor_split_str += "/";
            }
        }

        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
        };
        return values;
    }

    std::map<std::string, std::string> get_map() const {
        std::map<std::string, std::string> map;
        auto fields = get_fields();
        auto values = get_values();
        std::transform(fields.begin(), fields.end(), values.begin(),
                std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
        return map;
    }
};
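
// Static test fields are resolved once at startup: build info comes from the
// LLAMA_COMMIT / LLAMA_BUILD_NUMBER macros, backend flags from the ggml_cpu_has_*
// feature queries, and the CPU/GPU descriptions from the helpers above.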
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
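
// Common interface for the output formats (CSV, JSON, markdown, SQL). Results are
// streamed: a header, then one entry per test, then an optional footer.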
struct printer {
    virtual ~printer() {}

    FILE * fout;
    virtual void print_header(const cmd_params & params) { (void) params; }
    virtual void print_test(const test & t) = 0;
    virtual void print_footer() { }
};
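
// CSV output: one header line with the field names, then one line per test.
// escape_csv wraps every value in double quotes and doubles embedded quotes,
// e.g. a"b becomes "a""b".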
struct csv_printer : public printer {
    static std::string escape_csv(const std::string & field) {
        std::string escaped = "\"";
        for (auto c : field) {
            if (c == '"') {
                escaped += "\"";
            }
            escaped += c;
        }
        escaped += "\"";
        return escaped;
    }

    void print_header(const cmd_params & params) override {
        std::vector<std::string> fields = test::get_fields();
        fprintf(fout, "%s\n", join(fields, ",").c_str());
        (void) params;
    }

    void print_test(const test & t) override {
        std::vector<std::string> values = t.get_values();
        std::transform(values.begin(), values.end(), values.begin(), escape_csv);
        fprintf(fout, "%s\n", join(values, ",").c_str());
    }
};
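
// JSON output: a single array with one object per test. escape_json handles quotes,
// backslashes and control characters (emitted as \u00XX escapes).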
struct json_printer : public printer {
    bool first = true;

    static std::string escape_json(const std::string & value) {
        std::string escaped;
        for (auto c : value) {
            if (c == '"') {
                escaped += "\\\"";
            } else if (c == '\\') {
                escaped += "\\\\";
            } else if (c <= 0x1f) {
                char buf[8];
                snprintf(buf, sizeof(buf), "\\u%04x", c);
                escaped += buf;
            } else {
                escaped += c;
            }
        }
        return escaped;
    }

    static std::string format_value(const std::string & field, const std::string & value) {
        switch (test::get_field_type(field)) {
            case test::STRING:
                return "\"" + escape_json(value) + "\"";
            case test::BOOL:
                return value == "0" ? "false" : "true";
            default:
                return value;
        }
    }

    void print_header(const cmd_params & params) override {
        fprintf(fout, "[\n");
        (void) params;
    }

    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
        assert(fields.size() == values.size());
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
        }
    }

    void print_test(const test & t) override {
        if (first) {
            first = false;
        } else {
            fprintf(fout, ",\n");
        }
        fprintf(fout, " {\n");
        print_fields(test::get_fields(), t.get_values());
        fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
        fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
        fprintf(fout, " }");
        fflush(fout);
    }

    void print_footer() override {
        fprintf(fout, "\n]\n");
    }
};
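
// Markdown output: a table whose columns are selected in print_header. Column widths
// follow printf's %*s convention: negative widths left-align (string fields),
// positive widths right-align (numeric fields).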
struct markdown_printer : public printer {
    std::vector<std::string> fields;

    static int get_field_width(const std::string & field) {
        if (field == "model") {
            return -30;
        }
        if (field == "t/s") {
            return 16;
        }
        if (field == "size" || field == "params") {
            return 10;
        }
        if (field == "n_gpu_layers") {
            return 3;
        }
        if (field == "test") {
            return 13;
        }

        int width = std::max((int)field.length(), 10);

        if (test::get_field_type(field) == test::STRING) {
            return -width;
        }
        return width;
    }

    static std::string get_field_display_name(const std::string & field) {
        if (field == "n_gpu_layers") {
            return "ngl";
        }
        if (field == "split_mode") {
            return "sm";
        }
        if (field == "n_threads") {
            return "threads";
        }
        if (field == "no_kv_offload") {
            return "nkvo";
        }
        if (field == "flash_attn") {
            return "fa";
        }
        if (field == "use_mmap") {
            return "mmap";
        }
        if (field == "embeddings") {
            return "embd";
        }
        if (field == "tensor_split") {
            return "ts";
        }
        return field;
    }
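
    // Select which columns to print: always model/size/params/backend/test/t/s, plus
    // any parameter that was varied on the command line or differs from its default.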
    void print_header(const cmd_params & params) override {
        // select fields to print
        fields.emplace_back("model");
        fields.emplace_back("size");
        fields.emplace_back("params");
        fields.emplace_back("backend");
        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
        if (!is_cpu_backend) {
            fields.emplace_back("n_gpu_layers");
        }
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
            fields.emplace_back("n_threads");
        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.emplace_back("n_batch");
        }
        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
            fields.emplace_back("n_ubatch");
        }
        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
            fields.emplace_back("type_k");
        }
        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
            fields.emplace_back("type_v");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.emplace_back("main_gpu");
        }
        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
            fields.emplace_back("split_mode");
        }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
            fields.emplace_back("flash_attn");
        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.emplace_back("tensor_split");
        }
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
        fields.emplace_back("test");
        fields.emplace_back("t/s");

        fprintf(fout, "|");
        for (const auto & field : fields) {
            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
        }
        fprintf(fout, "\n");
        fprintf(fout, "|");
        for (const auto & field : fields) {
            int width = get_field_width(field);
            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
        }
        fprintf(fout, "\n");
    }
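
    // Render one table row. The "test" column is encoded as pp<n_prompt> for prompt
    // processing, tg<n_gen> for token generation, or pp<N>+tg<N> for combined runs.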
    void print_test(const test & t) override {
        std::map<std::string, std::string> vmap = t.get_map();

        fprintf(fout, "|");
        for (const auto & field : fields) {
            std::string value;
            char buf[128];
            if (field == "model") {
                value = t.model_type;
            } else if (field == "size") {
                if (t.model_size < 1024*1024*1024) {
                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                }
                value = buf;
            } else if (field == "params") {
                if (t.model_n_params < 1000*1000*1000) {
                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
                }
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                } else {
                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                }
                value = buf;
            } else if (field == "t/s") {
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
                value = vmap.at(field);
            } else {
                assert(false);
                exit(1);
            }

            int width = get_field_width(field);
            if (field == "t/s") {
                // HACK: the utf-8 character is 2 bytes
                width += 1;
            }
            fprintf(fout, " %*s |", width, value.c_str());
        }
        fprintf(fout, "\n");
    }

    void print_footer() override {
        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
    }
};
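
// SQL output: a CREATE TABLE statement derived from the field types, followed by one
// INSERT per test. All values are emitted as quoted literals.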
struct sql_printer : public printer {
    static std::string get_sql_field_type(const std::string & field) {
        switch (test::get_field_type(field)) {
            case test::STRING:
                return "TEXT";
            case test::BOOL:
            case test::INT:
                return "INTEGER";
            case test::FLOAT:
                return "REAL";
            default:
                assert(false);
                exit(1);
        }
    }

    void print_header(const cmd_params & params) override {
        std::vector<std::string> fields = test::get_fields();
        fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : "");
        }
        fprintf(fout, ");\n");
        fprintf(fout, "\n");
        (void) params;
    }

    void print_test(const test & t) override {
        fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
        fprintf(fout, "VALUES (");
        std::vector<std::string> values = t.get_values();
        for (size_t i = 0; i < values.size(); i++) {
            fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
        }
        fprintf(fout, ");\n");
    }
};
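
// Prompt-processing benchmark: decode n_prompt random tokens in chunks of n_batch,
// then synchronize so the measured time includes any pending backend work.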
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

    const llama_model * model = llama_get_model(ctx);
    const int32_t n_vocab = llama_n_vocab(model);

    std::vector<llama_token> tokens(n_batch);

    int n_processed = 0;

    while (n_processed < n_prompt) {
        int n_tokens = std::min(n_prompt - n_processed, n_batch);
        tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
        for (int i = 1; i < n_tokens; i++) {
            tokens[i] = std::rand() % n_vocab;
        }
        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
        n_processed += n_tokens;
    }

    llama_synchronize(ctx);
}
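
// Token-generation benchmark: decode one token at a time for n_gen steps,
// synchronizing after each call so per-token latency is fully accounted for.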
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

    const llama_model * model = llama_get_model(ctx);
    const int32_t n_vocab = llama_n_vocab(model);

    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
        llama_synchronize(ctx);
        token = std::rand() % n_vocab;
    }
}

static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) text;
    (void) user_data;
}
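
// main: parse the command line, expand it into a list of test instances, run each
// instance (reusing the loaded model when only context parameters change), and
// stream the results through the selected printer.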
int main(int argc, char ** argv) {
    // try to set locale for unicode characters in markdown
    setlocale(LC_CTYPE, ".UTF-8");

#if !defined(NDEBUG)
    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif

#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
    fprintf(stderr, "warning: debug build, performance may be affected\n");
#endif

#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif

    cmd_params params = parse_cmd_params(argc, argv);

    // initialize llama.cpp
    if (!params.verbose) {
        llama_log_set(llama_null_log_callback, NULL);
    }
    llama_backend_init();
    llama_numa_init(params.numa);

    // initialize printer
    std::unique_ptr<printer> p;
    switch (params.output_format) {
        case CSV:
            p.reset(new csv_printer());
            break;
        case JSON:
            p.reset(new json_printer());
            break;
        case MARKDOWN:
            p.reset(new markdown_printer());
            break;
        case SQL:
            p.reset(new sql_printer());
            break;
        default:
            assert(false);
            exit(1);
    }
    p->fout = stdout;
    p->print_header(params);

    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;
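
    // Run every parameter combination; the model is reloaded only when the model
    // parameters change, and each combination gets a fresh context.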
    for (const auto & inst : params_instances) {
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
                llama_free_model(lmodel);
            }

            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
            if (lmodel == NULL) {
                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                return 1;
            }
            prev_inst = &inst;
        }

        llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
            llama_free_model(lmodel);
            return 1;
        }

        test t(inst, lmodel, ctx);

        llama_kv_cache_clear(ctx);

        // warmup run
        if (t.n_prompt > 0) {
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
        }
        if (t.n_gen > 0) {
            test_gen(ctx, 1, 0, t.n_threads);
        }
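
        // Timed repetitions: each rep clears the KV cache and measures the prompt
        // and/or generation phase back-to-back as a single wall-clock sample.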
        for (int i = 0; i < params.reps; i++) {
            llama_kv_cache_clear(ctx);

            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
            }
            if (t.n_gen > 0) {
                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
            }

            uint64_t t_ns = get_time_ns() - t_start;
            t.samples_ns.push_back(t_ns);
        }

        p->print_test(t);

        llama_print_timings(ctx);

        llama_free(ctx);
    }

    llama_free_model(lmodel);

    p->print_footer();

    llama_backend_free();

    return 0;
}