2023-09-15 21:38:27 +02:00
# define LLAMA_API_INTERNAL
2023-03-22 06:32:36 +01:00
# include "llama.h"
2023-10-03 09:16:26 +02:00
# include "unicode.h"
2023-03-22 06:32:36 +01:00
# include "ggml.h"
2023-08-21 22:07:43 +02:00
2023-08-22 15:25:19 +02:00
# include "ggml-alloc.h"
2023-08-21 22:07:43 +02:00
2023-05-13 15:38:36 +02:00
# ifdef GGML_USE_CUBLAS
2023-08-21 22:07:43 +02:00
# include "ggml-cuda.h"
2023-05-22 23:33:24 +02:00
# elif defined(GGML_USE_CLBLAST)
2023-08-21 22:07:43 +02:00
# include "ggml-opencl.h"
2023-05-13 15:38:36 +02:00
# endif
2023-03-22 06:32:36 +01:00
2023-06-04 22:34:30 +02:00
# ifdef GGML_USE_METAL
2023-08-21 22:07:43 +02:00
# include "ggml-metal.h"
2023-06-04 22:34:30 +02:00
# endif
2023-07-10 17:49:56 +02:00
# ifdef GGML_USE_MPI
2023-08-21 22:07:43 +02:00
# include "ggml-mpi.h"
2023-07-10 17:49:56 +02:00
# endif
2023-06-18 10:13:43 +02:00
# ifdef GGML_USE_K_QUANTS
2023-08-21 22:07:43 +02:00
# ifndef QK_K
# ifdef GGML_QKK_64
# define QK_K 64
# else
# define QK_K 256
# endif
# endif
2023-06-18 10:13:43 +02:00
# endif
2023-08-21 22:07:43 +02:00
# ifdef __has_include
# if __has_include(<unistd.h>)
# include <unistd.h>
# if defined(_POSIX_MAPPED_FILES)
# include <sys/mman.h>
# endif
# if defined(_POSIX_MEMLOCK_RANGE)
# include <sys/resource.h>
# endif
# endif
2023-06-18 10:13:43 +02:00
# endif
2023-08-21 22:07:43 +02:00
# if defined(_WIN32)
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <io.h>
# include <stdio.h> // for _fseeki64
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as fast as
with super-blocks with 256 weights: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because of the larger model size), so some fraction of
these 20% is due to that.
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: switch Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 18:43:07 +02:00
# endif
2023-06-04 22:34:30 +02:00
2023-08-21 22:07:43 +02:00
# include <algorithm>
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
# include <array>
2023-08-21 22:07:43 +02:00
# include <cassert>
2023-03-22 06:32:36 +01:00
# include <cinttypes>
2023-08-21 22:07:43 +02:00
# include <climits>
# include <cstdarg>
2023-08-26 20:17:51 +02:00
# include <cstddef>
# include <cstdint>
# include <cstdio>
2023-08-21 22:07:43 +02:00
# include <cstring>
# include <ctime>
2023-03-22 06:32:36 +01:00
# include <fstream>
2023-08-21 22:07:43 +02:00
# include <initializer_list>
2023-03-24 22:17:37 +01:00
# include <map>
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
# include <memory>
2023-04-20 19:42:27 +02:00
# include <mutex>
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
# include <numeric>
2023-08-21 22:07:43 +02:00
# include <queue>
# include <random>
2023-08-23 22:08:04 +02:00
# include <regex>
2023-08-21 22:07:43 +02:00
# include <sstream>
# include <thread>
# include <unordered_map>
2023-09-28 18:04:36 +02:00
# include <set>
2023-03-29 22:51:37 +02:00
2023-06-16 20:23:53 +02:00
# if defined(_MSC_VER)
# pragma warning(disable: 4244 4267) // possible loss of data
# endif
2023-08-21 22:07:43 +02:00
// LLAMA_ATTRIBUTE_FORMAT(string_index, first_to_check)
//
// Expands to a `format` function attribute when __GNUC__ is defined:
//   - gnu_printf archetype when __MINGW32__ is also defined
//   - printf     archetype otherwise
// and expands to nothing when __GNUC__ is not defined.
#if defined(__GNUC__) && defined(__MINGW32__)
#    define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#elif defined(__GNUC__)
#    define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#else
#    define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
2023-08-21 22:07:43 +02:00
//
// logging
//
2023-08-23 22:08:04 +02:00
2023-08-21 22:07:43 +02:00
// Forward declarations of the logging entry points; both are only declared
// in this part of the file.
//   llama_log_internal         - printf-style (format string is argument 2,
//                                variadic arguments start at argument 3)
//   llama_log_callback_default - receives an already-formatted text plus an
//                                opaque user pointer
LLAMA_ATTRIBUTE_FORMAT(2, 3)
static void llama_log_internal        (ggml_log_level level, const char * format, ...);
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

// Level-specific shorthands: each forwards its arguments to
// llama_log_internal() with a fixed ggml_log_level.
#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
2023-03-24 22:17:37 +01:00
2023-08-21 22:07:43 +02:00
//
// helpers
//
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
// Returns the byte length associated with `src` when it is treated as the
// first byte of a UTF-8 sequence. Only the top four bits are significant:
//   0x0_ .. 0xB_ -> 1   (includes 0x80..0xBF, which also map to 1)
//   0xC_ .. 0xD_ -> 2
//   0xE_         -> 3
//   0xF_         -> 4
static size_t utf8_len(char src) {
    // go through unsigned char so the comparison does not depend on the
    // signedness of plain char
    const unsigned lead = static_cast<unsigned char>(src);

    if (lead < 0xC0) {
        return 1;
    }
    if (lead < 0xE0) {
        return 2;
    }
    if (lead < 0xF0) {
        return 3;
    }
    return 4;
}
2023-09-15 21:38:27 +02:00
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
// scanning left to right. Text inserted by a replacement is not rescanned.
//
//   s       - string to modify in place
//   search  - substring to look for; when empty, `s` is left untouched
//   replace - text substituted for each occurrence
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    // An empty pattern matches at every position without consuming input,
    // so the scan below would never advance and would grow `result` forever.
    if (search.empty()) {
        return;
    }

    std::string result;
    size_t pos = 0;
    for (;;) {
        const size_t hit = s.find(search, pos);
        if (hit == std::string::npos) {
            // no more matches: keep the remaining tail verbatim
            result.append(s, pos, std::string::npos);
            break;
        }
        result.append(s, pos, hit - pos);
        result += replace;
        pos = hit + search.length();
    }

    s = std::move(result);
}
2023-10-06 12:47:59 +02:00
// Absolute-tolerance comparison of two floats.
//
// Returns true when a == b, or when neither value is infinite and
// |b - a| <= abs_tol. Returns false when the values differ and at least one
// of them is infinite.
//
// Throws std::invalid_argument when abs_tol is negative.
static bool is_float_close(float a, float b, float abs_tol) {
    if (abs_tol < 0.0) {
        throw std::invalid_argument(" Tolerance must be non-negative ");
    }

    // equal values (including two infinities of the same sign) short-circuit
    // before the infinity test
    const bool both_finite = !std::isinf(a) && !std::isinf(b);
    return a == b || (both_finite && std::fabs(b - a) <= abs_tol);
}
2023-09-08 03:46:56 +02:00
# ifdef GGML_USE_CPU_HBM
# include <hbwmalloc.h>
# endif
2023-08-23 22:08:04 +02:00
2023-08-21 22:07:43 +02:00
// Writes `n` zero bytes to `file`, one byte per write() call.
static void zeros(std::ofstream & file, size_t n) {
    const char nul = 0;
    while (n-- > 0) {
        file.write(&nul, 1);
    }
}
2023-06-06 21:33:23 +02:00
2023-08-21 22:07:43 +02:00
// printf-style formatting into a std::string.
//
// The argument list is consumed twice: a first vsnprintf() call with a NULL
// buffer measures the output, a second one renders it into a buffer of that
// size (+1 for the terminator). Both results are checked with GGML_ASSERT.
LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
    va_list args_measure;
    va_list args_render;
    va_start(args_measure, fmt);
    va_copy(args_render, args_measure); // a va_list cannot be reused after vsnprintf

    const int len = vsnprintf(NULL, 0, fmt, args_measure);
    GGML_ASSERT(len >= 0 && len < INT_MAX); // NOLINT

    std::vector<char> out(len + 1);
    const int written = vsnprintf(out.data(), len + 1, fmt, args_render);
    GGML_ASSERT(written == len);

    va_end(args_render);
    va_end(args_measure);

    // drop the trailing NUL that vsnprintf appended
    return std::string(out.data(), len);
}
2023-08-23 22:08:04 +02:00
//
// gguf constants (sync with gguf.py)
//
// Model architecture identifiers.
// Enumerators take consecutive values starting at 0 in declaration order;
// LLM_ARCH_UNKNOWN is the last one.
enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_UNKNOWN,
};
static std : : map < llm_arch , std : : string > LLM_ARCH_NAMES = {
2023-09-28 18:04:36 +02:00
{ LLM_ARCH_LLAMA , " llama " } ,
{ LLM_ARCH_FALCON , " falcon " } ,
{ LLM_ARCH_GPT2 , " gpt2 " } ,
{ LLM_ARCH_GPTJ , " gptj " } ,
{ LLM_ARCH_GPTNEOX , " gptneox " } ,
{ LLM_ARCH_MPT , " mpt " } ,
{ LLM_ARCH_BAICHUAN , " baichuan " } ,
2023-09-15 21:02:13 +02:00
{ LLM_ARCH_STARCODER , " starcoder " } ,
2023-10-04 15:23:39 +02:00
{ LLM_ARCH_REFACT , " refact " } ,
2023-08-23 22:08:04 +02:00
} ;
// Identifiers for the metadata keys looked up through LLM_KV_NAMES.
// Enumerators take consecutive values starting at 0 in declaration order.
enum llm_kv {
    // general.*
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,

    // per-architecture hyperparameters
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,

    // attention
    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

    // rope
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,

    // tokenizer
    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
};
static std : : map < llm_kv , std : : string > LLM_KV_NAMES = {
2023-09-27 18:18:07 +02:00
{ LLM_KV_GENERAL_ARCHITECTURE , " general.architecture " } ,
{ LLM_KV_GENERAL_QUANTIZATION_VERSION , " general.quantization_version " } ,
{ LLM_KV_GENERAL_ALIGNMENT , " general.alignment " } ,
{ LLM_KV_GENERAL_NAME , " general.name " } ,
{ LLM_KV_GENERAL_AUTHOR , " general.author " } ,
{ LLM_KV_GENERAL_URL , " general.url " } ,
{ LLM_KV_GENERAL_DESCRIPTION , " general.description " } ,
{ LLM_KV_GENERAL_LICENSE , " general.license " } ,
{ LLM_KV_GENERAL_SOURCE_URL , " general.source.url " } ,
{ LLM_KV_GENERAL_SOURCE_HF_REPO , " general.source.huggingface.repository " } ,
2023-08-23 22:08:04 +02:00
{ LLM_KV_CONTEXT_LENGTH , " %s.context_length " } ,
{ LLM_KV_EMBEDDING_LENGTH , " %s.embedding_length " } ,
{ LLM_KV_BLOCK_COUNT , " %s.block_count " } ,
{ LLM_KV_FEED_FORWARD_LENGTH , " %s.feed_forward_length " } ,
{ LLM_KV_USE_PARALLEL_RESIDUAL , " %s.use_parallel_residual " } ,
{ LLM_KV_TENSOR_DATA_LAYOUT , " %s.tensor_data_layout " } ,
{ LLM_KV_ATTENTION_HEAD_COUNT , " %s.attention.head_count " } ,
{ LLM_KV_ATTENTION_HEAD_COUNT_KV , " %s.attention.head_count_kv " } ,
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS , " %s.attention.max_alibi_bias " } ,
{ LLM_KV_ATTENTION_CLAMP_KQV , " %s.attention.clamp_kqv " } ,
{ LLM_KV_ATTENTION_LAYERNORM_EPS , " %s.attention.layer_norm_epsilon " } ,
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS , " %s.attention.layer_norm_rms_epsilon " } ,
{ LLM_KV_ROPE_DIMENSION_COUNT , " %s.rope.dimension_count " } ,
2023-08-24 20:04:05 +02:00
{ LLM_KV_ROPE_FREQ_BASE , " %s.rope.freq_base " } ,
2023-08-23 22:08:04 +02:00
{ LLM_KV_ROPE_SCALE_LINEAR , " %s.rope.scale_linear " } ,
{ LLM_KV_TOKENIZER_MODEL , " tokenizer.ggml.model " } ,
{ LLM_KV_TOKENIZER_LIST , " tokenizer.ggml.tokens " } ,
{ LLM_KV_TOKENIZER_TOKEN_TYPE , " tokenizer.ggml.token_type " } ,
{ LLM_KV_TOKENIZER_SCORES , " tokenizer.ggml.scores " } ,
{ LLM_KV_TOKENIZER_MERGES , " tokenizer.ggml.merges " } ,
{ LLM_KV_TOKENIZER_BOS_ID , " tokenizer.ggml.bos_token_id " } ,
{ LLM_KV_TOKENIZER_EOS_ID , " tokenizer.ggml.eos_token_id " } ,
{ LLM_KV_TOKENIZER_UNK_ID , " tokenizer.ggml.unknown_token_id " } ,
{ LLM_KV_TOKENIZER_SEP_ID , " tokenizer.ggml.seperator_token_id " } ,
{ LLM_KV_TOKENIZER_PAD_ID , " tokenizer.ggml.padding_token_id " } ,
{ LLM_KV_TOKENIZER_HF_JSON , " tokenizer.huggingface.json " } ,
{ LLM_KV_TOKENIZER_RWKV , " tokenizer.rwkv.world " } ,
} ;
struct LLM_KV {
LLM_KV ( llm_arch arch ) : arch ( arch ) { }
llm_arch arch ;
std : : string operator ( ) ( llm_kv kv ) const {
return : : format ( LLM_KV_NAMES [ kv ] . c_str ( ) , LLM_ARCH_NAMES [ arch ] . c_str ( ) ) ;
}
} ;
// Identifiers for the tensor names stored in LLM_TENSOR_NAMES.
// Enumerators take consecutive values starting at 0 in declaration order.
enum llm_tensor {
    // embeddings / output
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,

    // attention
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K,
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_ROT_EMBD,

    // feed-forward
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_NORM,
};
static std : : map < llm_arch , std : : map < llm_tensor , std : : string > > LLM_TENSOR_NAMES = {
{
LLM_ARCH_LLAMA ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
{ LLM_TENSOR_OUTPUT_NORM , " output_norm " } ,
{ LLM_TENSOR_OUTPUT , " output " } ,
{ LLM_TENSOR_ROPE_FREQS , " rope_freqs " } ,
{ LLM_TENSOR_ATTN_NORM , " blk.%d.attn_norm " } ,
{ LLM_TENSOR_ATTN_Q , " blk.%d.attn_q " } ,
{ LLM_TENSOR_ATTN_K , " blk.%d.attn_k " } ,
{ LLM_TENSOR_ATTN_V , " blk.%d.attn_v " } ,
{ LLM_TENSOR_ATTN_OUT , " blk.%d.attn_output " } ,
{ LLM_TENSOR_ATTN_ROT_EMBD , " blk.%d.attn_rot_embd " } ,
{ LLM_TENSOR_FFN_NORM , " blk.%d.ffn_norm " } ,
{ LLM_TENSOR_FFN_GATE , " blk.%d.ffn_gate " } ,
{ LLM_TENSOR_FFN_DOWN , " blk.%d.ffn_down " } ,
{ LLM_TENSOR_FFN_UP , " blk.%d.ffn_up " } ,
} ,
} ,
2023-09-14 18:32:10 +02:00
{
LLM_ARCH_BAICHUAN ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
{ LLM_TENSOR_OUTPUT_NORM , " output_norm " } ,
{ LLM_TENSOR_OUTPUT , " output " } ,
{ LLM_TENSOR_ROPE_FREQS , " rope_freqs " } ,
{ LLM_TENSOR_ATTN_NORM , " blk.%d.attn_norm " } ,
{ LLM_TENSOR_ATTN_Q , " blk.%d.attn_q " } ,
{ LLM_TENSOR_ATTN_K , " blk.%d.attn_k " } ,
{ LLM_TENSOR_ATTN_V , " blk.%d.attn_v " } ,
{ LLM_TENSOR_ATTN_OUT , " blk.%d.attn_output " } ,
{ LLM_TENSOR_ATTN_ROT_EMBD , " blk.%d.attn_rot_embd " } ,
{ LLM_TENSOR_FFN_NORM , " blk.%d.ffn_norm " } ,
{ LLM_TENSOR_FFN_GATE , " blk.%d.ffn_gate " } ,
{ LLM_TENSOR_FFN_DOWN , " blk.%d.ffn_down " } ,
{ LLM_TENSOR_FFN_UP , " blk.%d.ffn_up " } ,
} ,
} ,
2023-08-23 22:08:04 +02:00
{
LLM_ARCH_FALCON ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
{ LLM_TENSOR_OUTPUT_NORM , " output_norm " } ,
{ LLM_TENSOR_OUTPUT , " output " } ,
{ LLM_TENSOR_ATTN_NORM , " blk.%d.attn_norm " } ,
{ LLM_TENSOR_ATTN_NORM_2 , " blk.%d.attn_norm_2 " } ,
{ LLM_TENSOR_ATTN_QKV , " blk.%d.attn_qkv " } ,
{ LLM_TENSOR_ATTN_OUT , " blk.%d.attn_output " } ,
{ LLM_TENSOR_FFN_DOWN , " blk.%d.ffn_down " } ,
{ LLM_TENSOR_FFN_UP , " blk.%d.ffn_up " } ,
} ,
} ,
2023-09-03 07:36:28 +02:00
{
LLM_ARCH_GPT2 ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
} ,
} ,
{
LLM_ARCH_GPTJ ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
} ,
} ,
{
LLM_ARCH_GPTNEOX ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
{ LLM_TENSOR_OUTPUT_NORM , " output_norm " } ,
{ LLM_TENSOR_OUTPUT , " output " } ,
{ LLM_TENSOR_ATTN_NORM , " blk.%d.attn_norm " } ,
{ LLM_TENSOR_ATTN_QKV , " blk.%d.attn_qkv " } ,
{ LLM_TENSOR_ATTN_OUT , " blk.%d.attn_output " } ,
{ LLM_TENSOR_FFN_NORM , " blk.%d.ffn_norm " } ,
{ LLM_TENSOR_FFN_DOWN , " blk.%d.ffn_down " } ,
{ LLM_TENSOR_FFN_UP , " blk.%d.ffn_up " } ,
} ,
} ,
{
LLM_ARCH_MPT ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
} ,
} ,
2023-09-15 21:02:13 +02:00
{
LLM_ARCH_STARCODER ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
{ LLM_TENSOR_POS_EMBD , " position_embd " } ,
{ LLM_TENSOR_OUTPUT_NORM , " output_norm " } ,
{ LLM_TENSOR_OUTPUT , " output " } ,
{ LLM_TENSOR_ATTN_NORM , " blk.%d.attn_norm " } ,
{ LLM_TENSOR_ATTN_QKV , " blk.%d.attn_qkv " } ,
{ LLM_TENSOR_ATTN_OUT , " blk.%d.attn_output " } ,
{ LLM_TENSOR_FFN_NORM , " blk.%d.ffn_norm " } ,
{ LLM_TENSOR_FFN_UP , " blk.%d.ffn_up " } ,
{ LLM_TENSOR_FFN_DOWN , " blk.%d.ffn_down " } ,
} ,
} ,
2023-10-04 15:23:39 +02:00
{
LLM_ARCH_REFACT ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
{ LLM_TENSOR_OUTPUT_NORM , " output_norm " } ,
{ LLM_TENSOR_OUTPUT , " output " } ,
{ LLM_TENSOR_ATTN_NORM , " blk.%d.attn_norm " } ,
{ LLM_TENSOR_ATTN_Q , " blk.%d.attn_q " } ,
{ LLM_TENSOR_ATTN_K , " blk.%d.attn_k " } ,
{ LLM_TENSOR_ATTN_V , " blk.%d.attn_v " } ,
{ LLM_TENSOR_ATTN_OUT , " blk.%d.attn_output " } ,
{ LLM_TENSOR_FFN_NORM , " blk.%d.ffn_norm " } ,
{ LLM_TENSOR_FFN_GATE , " blk.%d.ffn_gate " } ,
{ LLM_TENSOR_FFN_DOWN , " blk.%d.ffn_down " } ,
{ LLM_TENSOR_FFN_UP , " blk.%d.ffn_up " } ,
} ,
} ,
2023-09-03 07:36:28 +02:00
{
LLM_ARCH_UNKNOWN ,
{
{ LLM_TENSOR_TOKEN_EMBD , " token_embd " } ,
} ,
} ,
2023-08-23 22:08:04 +02:00
} ;
// Reverse lookup in LLM_ARCH_NAMES: returns the architecture whose stored
// name compares equal to `name`, or LLM_ARCH_UNKNOWN when no entry matches.
// The comparison is an exact std::string equality.
static llm_arch llm_arch_from_string(const std::string & name) {
    const auto hit = std::find_if(
        LLM_ARCH_NAMES.begin(), LLM_ARCH_NAMES.end(),
        [&name](const std::pair<const llm_arch, std::string> & entry) {
            return entry.second == name;
        });

    return hit != LLM_ARCH_NAMES.end() ? hit->first : LLM_ARCH_UNKNOWN;
}
// helper to handle gguf constants
// usage:
//
// const auto tn = LLM_TN(LLM_ARCH_LLAMA);
//
// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
//
struct LLM_TN {
LLM_TN ( llm_arch arch ) : arch ( arch ) { }
llm_arch arch ;
std : : string operator ( ) ( llm_tensor tensor ) const {
return LLM_TENSOR_NAMES [ arch ] . at ( tensor ) ;
}
std : : string operator ( ) ( llm_tensor tensor , const std : : string & suffix ) const {
return LLM_TENSOR_NAMES [ arch ] . at ( tensor ) + " . " + suffix ;
}
std : : string operator ( ) ( llm_tensor tensor , int bid ) const {
return : : format ( LLM_TENSOR_NAMES [ arch ] . at ( tensor ) . c_str ( ) , bid ) ;
}
std : : string operator ( ) ( llm_tensor tensor , const std : : string & suffix , int bid ) const {
return : : format ( LLM_TENSOR_NAMES [ arch ] . at ( tensor ) . c_str ( ) , bid ) + " . " + suffix ;
}
} ;
//
// gguf helpers
//
// GGUF_GET_KEY(ctx, dst, func, type, req, key)
//
// Looks up `key` in `ctx` with gguf_find_key() and, when found, stores
// func(ctx, <key index>) into `dst`.
//   type - expected gguf_type of the entry; a mismatch throws
//          std::runtime_error
//   req  - when true, a missing key throws std::runtime_error; when false,
//          a missing key leaves `dst` untouched
// The expansion declares the locals skey, kid and ktype inside its own
// do { } while (0) scope.
#define GGUF_GET_KEY(ctx, dst, func, type, req, key)                                                                           \
do {                                                                                                                           \
    const std::string skey(key);                                                                                               \
    const int kid = gguf_find_key(ctx, skey.c_str());                                                                          \
    if (kid < 0) {                                                                                                             \
        if (req) {                                                                                                             \
            throw std::runtime_error(format(" key not found in model: %s ", skey.c_str()));                                    \
        }                                                                                                                      \
    } else {                                                                                                                   \
        enum gguf_type ktype = gguf_get_kv_type(ctx, kid);                                                                     \
        if (ktype != (type)) {                                                                                                 \
            throw std::runtime_error(format(" key %s has wrong type: %s ", skey.c_str(), gguf_type_name(ktype)));              \
        }                                                                                                                      \
        (dst) = func(ctx, kid);                                                                                                \
    }                                                                                                                          \
} while (0)
2023-08-23 22:08:04 +02:00
2023-07-07 18:24:01 +02:00
//
// ggml helpers
//
static void ggml_graph_compute_helper ( std : : vector < uint8_t > & buf , ggml_cgraph * graph , int n_threads ) {
struct ggml_cplan plan = ggml_graph_plan ( graph , n_threads ) ;
if ( plan . work_size > 0 ) {
buf . resize ( plan . work_size ) ;
plan . work_data = buf . data ( ) ;
}
ggml_graph_compute ( graph , & plan ) ;
}
2023-08-21 22:07:43 +02:00
//
// llama helpers
//
// llama_host_malloc(n) / llama_host_free(data)
//
// Host allocation hooks selected at compile time, first match wins:
//   GGML_USE_CUBLAS  -> ggml_cuda_host_malloc  / ggml_cuda_host_free
//   GGML_USE_METAL   -> ggml_metal_host_malloc / ggml_metal_host_free
//   GGML_USE_CPU_HBM -> hbw_malloc             / hbw_free (skipped for NULL)
//   otherwise        -> malloc                 / free
//
// All branches test whether the macro is defined, matching the #ifdef
// checks that guard the corresponding #include lines in this file.
#if defined(GGML_USE_CUBLAS)
#    define llama_host_malloc(n)  ggml_cuda_host_malloc(n)
#    define llama_host_free(data) ggml_cuda_host_free(data)
#elif defined(GGML_USE_METAL)
#    define llama_host_malloc(n)  ggml_metal_host_malloc(n)
#    define llama_host_free(data) ggml_metal_host_free(data)
#elif defined(GGML_USE_CPU_HBM)
#    define llama_host_malloc(n)  hbw_malloc(n)
     // wrapped in do { } while (0) so the expansion is a single statement and
     // cannot capture an `else` that follows the call site
#    define llama_host_free(data) do { if ((data) != NULL) { hbw_free(data); } } while (0)
#else
#    define llama_host_malloc(n)  malloc(n)
#    define llama_host_free(data) free(data)
#endif
#if defined(_WIN32)
// Converts a Windows error code into its system message text via
// FormatMessageA(). The buffer allocated by FormatMessageA is copied into the
// returned string and then released with LocalFree(). When FormatMessageA
// returns 0, a fixed fallback string is returned instead.
static std::string llama_format_win_err(DWORD err) {
    const DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;

    LPSTR msg;
    // with FORMAT_MESSAGE_ALLOCATE_BUFFER the lpBuffer argument receives a
    // pointer, hence the address-of plus cast
    const size_t len = FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                                      (LPSTR) &msg, 0, NULL);
    if (len == 0) {
        return " FormatMessageA failed ";
    }

    std::string text(msg, len);
    LocalFree(msg);
    return text;
}
#endif
// Owning byte buffer allocated through llama_host_malloc(), with a plain
// malloc() fallback when the host allocator returns NULL.
//
// `fallback` records which allocator produced `data`, so the matching
// deallocator is used both when the buffer is destroyed and when it is
// resized.
struct llama_buffer {
    void * data = NULL;
    size_t size = 0;

    // fallback to malloc / free
    // useful in cases where CUDA can try to allocate PINNED memory
    bool fallback = false;

    // Discards the current contents and allocates `n` fresh bytes.
    // Existing data is NOT preserved. Aborts via GGML_ASSERT when both the
    // host allocator and malloc() fail.
    void resize(size_t n) {
        // must honour `fallback`: memory that came from malloc() in an
        // earlier resize() may not be handed to llama_host_free()
        release();

        data = llama_host_malloc(n);
        if (!data) {
            fallback = true;
            data = malloc(n);
        } else {
            fallback = false;
        }

        GGML_ASSERT(data);

        size = n;
    }

    ~llama_buffer() {
        release();
    }

private:
    // Frees `data` with the deallocator matching how it was obtained and
    // resets the pointer. No-op when nothing is allocated.
    void release() {
        if (data == NULL) {
            return;
        }
        if (fallback) { // NOLINT
            free(data);
        } else {
            llama_host_free(data);
        }
        data = NULL;
    }
};
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp ;
size_t size ;
llama_file ( const char * fname , const char * mode ) {
fp = std : : fopen ( fname , mode ) ;
if ( fp = = NULL ) {
throw std : : runtime_error ( format ( " failed to open %s: %s " , fname , strerror ( errno ) ) ) ;
}
seek ( 0 , SEEK_END ) ;
size = tell ( ) ;
seek ( 0 , SEEK_SET ) ;
}
size_t tell ( ) const {
# ifdef _WIN32
__int64 ret = _ftelli64 ( fp ) ;
# else
long ret = std : : ftell ( fp ) ;
# endif
GGML_ASSERT ( ret ! = - 1 ) ; // this really shouldn't fail
return ( size_t ) ret ;
}
void seek ( size_t offset , int whence ) const {
# ifdef _WIN32
int ret = _fseeki64 ( fp , ( __int64 ) offset , whence ) ;
# else
int ret = std : : fseek ( fp , ( long ) offset , whence ) ;
# endif
GGML_ASSERT ( ret = = 0 ) ; // same
}
void read_raw ( void * ptr , size_t len ) const {
if ( len = = 0 ) {
return ;
}
errno = 0 ;
std : : size_t ret = std : : fread ( ptr , len , 1 , fp ) ;
if ( ferror ( fp ) ) {
throw std : : runtime_error ( format ( " read error: %s " , strerror ( errno ) ) ) ;
}
if ( ret ! = 1 ) {
throw std : : runtime_error ( std : : string ( " unexpectedly reached end of file " ) ) ;
}
}
uint32_t read_u32 ( ) const {
uint32_t ret ;
read_raw ( & ret , sizeof ( ret ) ) ;
return ret ;
}
void write_raw ( const void * ptr , size_t len ) const {
if ( len = = 0 ) {
return ;
}
errno = 0 ;
size_t ret = std : : fwrite ( ptr , len , 1 , fp ) ;
if ( ret ! = 1 ) {
throw std : : runtime_error ( format ( " write error: %s " , strerror ( errno ) ) ) ;
}
}
void write_u32 ( std : : uint32_t val ) const {
write_raw ( & val , sizeof ( val ) ) ;
}
~ llama_file ( ) {
if ( fp ) {
std : : fclose ( fp ) ;
}
}
} ;
// Read-only memory mapping of a whole llama_file.
//
// The mapping is created by the constructor and released by the destructor;
// copying is disabled. `SUPPORTED` tells callers at compile time whether this
// platform has an implementation; where it is false the constructor always
// throws.
struct llama_mmap {
    void * addr; // base address of the mapped view
    size_t size; // length of the mapping in bytes (copied from file->size)

    llama_mmap(const llama_mmap &) = delete;

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

    // posix_madvise() is specified to return an error number rather than to
    // set errno. Defensively also accept a -1 return with errno set, in case
    // an implementation follows the madvise() convention instead.
    static int madvise_errno(int rc) {
        return rc == -1 ? errno : rc;
    }

    // Maps `file` with mmap(PROT_READ, MAP_SHARED).
    //   prefetch - number of leading bytes the kernel is asked to preload;
    //              0 disables the hint, the default (size_t) -1 covers the whole file
    //   numa     - when true, prefetching is turned off and random access is advised
    // Throws std::runtime_error if mmap() fails. A failing posix_madvise() only
    // produces a warning on stderr.
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
        // prefetch/readahead impairs performance on NUMA systems
        if (numa) { prefetch = 0; }
#ifdef __linux__
        if (prefetch) { flags |= MAP_POPULATE; }
#endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format(" mmap failed: %s ", strerror(errno)));
        }

        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory
            const int rc = posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED);
            if (rc != 0) {
                fprintf(stderr, " warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s \n ",
                        strerror(madvise_errno(rc)));
            }
        }
        if (numa) {
            // advise the kernel not to use readahead
            // (because the next page might not belong on the same node)
            const int rc = posix_madvise(addr, file->size, POSIX_MADV_RANDOM);
            if (rc != 0) {
                fprintf(stderr, " warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s \n ",
                        strerror(madvise_errno(rc)));
            }
        }
    }

    ~llama_mmap() {
        munmap(addr, size);
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    // Maps `file` read-only through CreateFileMappingA + MapViewOfFile.
    //   prefetch - when true, PrefetchVirtualMemory is called on the view if the
    //              running kernel32.dll exports it
    //   numa     - ignored on this platform
    // Throws std::runtime_error if the mapping or the view cannot be created.
    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
        (void) numa;

        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        // read the error code right away, before any other API call can replace it
        DWORD error = GetLastError();

        if (hMapping == NULL) {
            throw std::runtime_error(format(" CreateFileMappingA failed: %s ", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        // the mapping handle is closed here whether or not the view was created
        CloseHandle(hMapping);

        if (addr == NULL) {
            throw std::runtime_error(format(" MapViewOfFile failed: %s ", llama_format_win_err(error).c_str()));
        }

        if (prefetch) {
            // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
            // module and symbol names must be exact: padded names never resolve
            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");

            // may fail on pre-Windows 8 systems
            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));

            if (pPrefetchVirtualMemory) {
                // advise the kernel to preload the mapped memory
                WIN32_MEMORY_RANGE_ENTRY range;
                range.VirtualAddress = addr;
                range.NumberOfBytes = (SIZE_T) size;
                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                    fprintf(stderr, " warning: PrefetchVirtualMemory failed: %s \n ",
                            llama_format_win_err(GetLastError()).c_str());
                }
            }
        }
    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
            fprintf(stderr, " warning: UnmapViewOfFile failed: %s \n ",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    // No mapping facility on this platform: always throws std::runtime_error.
    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
        (void) file;
        (void) prefetch;
        (void) numa;

        throw std::runtime_error(std::string(" mmap not supported "));
    }
#endif
};
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
//
// Usage as visible here: init() records the base address once, grow_to()
// extends the locked prefix, and the destructor unlocks [addr, addr + size).
// `SUPPORTED` is false on platforms where raw_lock() can only fail.
struct llama_mlock {
    void * addr = NULL;          // base of the region, set by init()
    size_t size = 0;             // number of bytes locked so far
    bool failed_already = false; // set after the first failed lock; grow_to() then becomes a no-op

    llama_mlock() {}
    llama_mlock(const llama_mlock &) = delete;

    ~llama_mlock() {
        if (size) {
            raw_unlock(addr, size);
        }
    }

    // Records the start address. Asserts that nothing was recorded or locked before.
    void init(void * ptr) {
        GGML_ASSERT(addr == NULL && size == 0); // NOLINT
        addr = ptr;
    }

    // Ensures at least `target_size` bytes from `addr` are locked. The target is
    // rounded up with a mask, which assumes lock_granularity() is a power of two.
    // Only the not-yet-locked tail is passed to raw_lock(); a failure is remembered
    // so later calls return immediately.
    void grow_to(size_t target_size) {
        GGML_ASSERT(addr);
        if (failed_already) {
            return;
        }
        size_t granularity = lock_granularity();
        target_size = (target_size + granularity - 1) & ~(granularity - 1);
        if (target_size > size) {
            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
                size = target_size;
            } else {
                failed_already = true;
            }
        }
    }

#ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;

    // Page size reported by sysconf(_SC_PAGESIZE).
    static size_t lock_granularity() {
        return (size_t) sysconf(_SC_PAGESIZE);
    }

    // The hint names RLIMIT_MEMLOCK, the limit that raw_lock() queries below.
    #ifdef __APPLE__
        #define MLOCK_SUGGESTION \
            " Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
            " decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l). \n "
    #else
        #define MLOCK_SUGGESTION \
            " Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root). \n "
    #endif

    // Locks `size` bytes at `addr` with mlock(). Returns true on success.
    // On failure prints a warning to stderr and returns false. The suggestion
    // text is appended only when errno is ENOMEM, getrlimit() succeeds, and the
    // hard limit does not exceed the soft limit plus the requested size.
    bool raw_lock(const void * addr, size_t size) const {
        if (!mlock(addr, size)) {
            return true;
        }

        // capture both errno-derived values before getrlimit() can overwrite errno
        char * errmsg = std::strerror(errno);
        bool suggest = (errno == ENOMEM);

        // Check if the resource limit is fine after all
        struct rlimit lock_limit;
        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
            suggest = false;
        }
        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
            suggest = false;
        }

        // `size` is the parameter (bytes requested now); `this->size` is the member (bytes already locked)
        fprintf(stderr, " warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s \n %s ",
                size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : " ");
        return false;
    }

    #undef MLOCK_SUGGESTION

    // Unlocks the range with munlock(); a failure only produces a warning.
    static void raw_unlock(void * addr, size_t size) {
        if (munlock(addr, size)) {
            fprintf(stderr, " warning: failed to munlock buffer: %s \n ", std::strerror(errno));
        }
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    // Page size reported by GetSystemInfo().
    static size_t lock_granularity() {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        return (size_t) si.dwPageSize;
    }

    // Locks `len` bytes at `ptr` with VirtualLock(). If the first attempt fails,
    // the process working-set limits are raised by len + 1 MiB and the lock is
    // tried once more. Returns false (after a warning) if anything fails.
    bool raw_lock(void * ptr, size_t len) const {
        for (int tries = 1; ; tries++) {
            if (VirtualLock(ptr, len)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, " warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s \n ",
                        len, size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

            // It failed but this was only the first try; increase the working
            // set size and try again.
            SIZE_T min_ws_size, max_ws_size;
            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
                fprintf(stderr, " warning: GetProcessWorkingSetSize failed: %s \n ",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
            // Per MSDN: "The maximum number of pages that a process can lock
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
            size_t increment = len + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += increment;
            max_ws_size += increment;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, " warning: SetProcessWorkingSetSize failed: %s \n ",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
        }
    }

    // Unlocks the range with VirtualUnlock(); a failure only produces a warning.
    static void raw_unlock(void * ptr, size_t len) {
        if (!VirtualUnlock(ptr, len)) {
            fprintf(stderr, " warning: failed to VirtualUnlock buffer: %s \n ",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    // Fixed value used for rounding on platforms without a locking primitive.
    static size_t lock_granularity() {
        return (size_t) 65536;
    }

    // Locking is unavailable: warns and reports failure.
    bool raw_lock(const void * addr, size_t len) const {
        (void) addr;
        (void) len;
        fprintf(stderr, " warning: mlock not supported on this system \n ");
        return false;
    }

    // Nothing can have been locked, so there is nothing to undo.
    static void raw_unlock(const void * addr, size_t len) {
        (void) addr;
        (void) len;
    }
#endif
};
// Pointer to a function that takes one ggml_tensor pointer and returns nothing.
using offload_func_t = void (*)(struct ggml_tensor * tensor);

// don't offload by default: this hook ignores its argument and does nothing
static void llama_nop(struct ggml_tensor * tensor) {
    static_cast<void>(tensor);
}
2023-08-27 13:19:19 +02:00
static std : : string llama_token_to_str ( const struct llama_context * ctx , llama_token token ) {
2023-08-21 22:07:43 +02:00
std : : vector < char > result ( 8 , 0 ) ;
2023-09-28 21:42:38 +02:00
const int n_tokens = llama_token_to_piece ( llama_get_model ( ctx ) , token , result . data ( ) , result . size ( ) ) ;
2023-08-21 22:07:43 +02:00
if ( n_tokens < 0 ) {
result . resize ( - n_tokens ) ;
2023-09-28 21:42:38 +02:00
int check = llama_token_to_piece ( llama_get_model ( ctx ) , token , result . data ( ) , result . size ( ) ) ;
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( check = = - n_tokens ) ;
} else {
result . resize ( n_tokens ) ;
}
return std : : string ( result . data ( ) , result . size ( ) ) ;
}
//
// globals
//
struct llama_state {
// We save the log callback globally
2023-09-27 17:48:33 +02:00
ggml_log_callback log_callback = llama_log_callback_default ;
2023-08-21 22:07:43 +02:00
void * log_callback_user_data = nullptr ;
} ;
static llama_state g_state ;
// available llama models
// Enumerators keep their implicit values: MODEL_UNKNOWN is 0 and each following
// entry is one higher, so the declaration order must not change.
enum e_model {
    MODEL_UNKNOWN,
    MODEL_1B,
    MODEL_3B,
    MODEL_7B,
    MODEL_13B,
    MODEL_15B,
    MODEL_30B,
    MODEL_34B,
    MODEL_40B,
    MODEL_65B,
    MODEL_70B,
};
// Binary size units, each 1024 times the previous one.
static const size_t kB = 1024;
static const size_t MB = 1024 * kB;
static const size_t GB = 1024 * MB;
2023-08-21 22:07:43 +02:00
2023-03-22 06:32:36 +01:00
struct llama_hparams {
2023-09-28 21:42:38 +02:00
bool vocab_only ;
2023-09-20 18:12:47 +02:00
uint32_t n_vocab ;
uint32_t n_ctx_train ; // context size the model was trained on
uint32_t n_embd ;
uint32_t n_head ;
uint32_t n_head_kv ;
uint32_t n_layer ;
uint32_t n_rot ;
uint32_t n_ff ;
float f_norm_eps ;
float f_norm_rms_eps ;
2023-09-28 21:42:38 +02:00
float rope_freq_base_train ;
float rope_freq_scale_train ;
llama : add custom RoPE (#2054)
* Implement customizable RoPE
The original RoPE has pre-defined parameters
theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2]
Our customizable RoPE, ggml_rope_custom_inplace, uses
theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]
with the default matches the original
scale = 1.0
base = 10000
The new command line arguments
--rope-freq-base
--rope-freq-scale
set the two new RoPE parameter.
Recent researches show changing these two parameters extends the context limit with minimal loss.
1. Extending Context to 8K
kaiokendev
https://kaiokendev.github.io/til#extending-context-to-8k
2. Extending Context Window of Large Language Models via Positional Interpolation
Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian
https://arxiv.org/abs/2306.15595
3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation.
https://www.reddit.com/user/bloc97
https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
For the bold, try adding the following command line parameters to your favorite model:
-c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5
* ggml-metal: fix custom rope
* common: fix argument names in help
* llama: increase MEM_REQ_EVAL for MODEL_3B
It avoids crashing for quantized weights on CPU.
Better ways to calculate the required buffer size would be better.
* llama: make MEM_REQ_EVAL depend on n_ctx
* server: use proper Content-Type in curl examples
Without the header Content-Type: application/json, curl will POST with
Content-Type: application/x-www-form-urlencoded
Though our simple server doesn't care, the httplib.h used has a limit
with CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
With Content-Type: application/json, we can send large json data.
* style : minor fixes, mostly indentations
* ggml : fix asserts
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-15 12:34:16 +02:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
bool operator ! = ( const llama_hparams & other ) const {
2023-10-06 12:47:59 +02:00
if ( this - > vocab_only ! = other . vocab_only ) return true ;
if ( this - > n_vocab ! = other . n_vocab ) return true ;
if ( this - > n_ctx_train ! = other . n_ctx_train ) return true ;
if ( this - > n_embd ! = other . n_embd ) return true ;
if ( this - > n_head ! = other . n_head ) return true ;
if ( this - > n_head_kv ! = other . n_head_kv ) return true ;
if ( this - > n_layer ! = other . n_layer ) return true ;
if ( this - > n_rot ! = other . n_rot ) return true ;
if ( this - > n_ff ! = other . n_ff ) return true ;
const float EPSILON = 1e-9 ;
if ( ! is_float_close ( this - > f_norm_eps , other . f_norm_eps , EPSILON ) ) return true ;
if ( ! is_float_close ( this - > f_norm_rms_eps , other . f_norm_rms_eps , EPSILON ) ) return true ;
if ( ! is_float_close ( this - > rope_freq_base_train , other . rope_freq_base_train , EPSILON ) ) return true ;
if ( ! is_float_close ( this - > rope_freq_scale_train , other . rope_freq_scale_train , EPSILON ) ) return true ;
return false ;
2023-07-23 14:09:47 +02:00
}
uint32_t n_gqa ( ) const {
return n_head / n_head_kv ;
}
uint32_t n_embd_head ( ) const {
return n_embd / n_head ;
}
uint32_t n_embd_gqa ( ) const {
return n_embd / n_gqa ( ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-09-28 21:42:38 +02:00
} ;
2023-07-22 20:17:57 +02:00
2023-09-28 21:42:38 +02:00
// Per-context parameters.
struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;
    uint32_t n_threads;       // number of threads to use for generation
    uint32_t n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;
    float rope_freq_scale;

    bool mul_mat_q;
};
// Tensor handles belonging to one layer. Every member is a plain pointer;
// nothing in this struct allocates or frees them.
struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm;
    struct ggml_tensor * attn_norm_b;
    struct ggml_tensor * attn_norm_2;
    struct ggml_tensor * attn_norm_2_b;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;
    struct ggml_tensor * wqkv;

    // attention bias
    struct ggml_tensor * bo;
    struct ggml_tensor * bqkv;

    // normalization
    struct ggml_tensor * ffn_norm;
    struct ggml_tensor * ffn_norm_b;

    // ff
    struct ggml_tensor * w1; // ffn_gate
    struct ggml_tensor * w2; // ffn_down
    struct ggml_tensor * w3; // ffn_up

    // ff bias
    struct ggml_tensor * b2; // ffn_down
    struct ggml_tensor * b3; // ffn_up
};
2023-09-28 18:04:36 +02:00
struct llama_kv_cell {
llama_pos pos = - 1 ;
llama_pos delta = 0 ;
std : : set < llama_seq_id > seq_id ;
bool has_seq_id ( const llama_seq_id & id ) const {
return seq_id . find ( id ) ! = seq_id . end ( ) ;
}
} ;
// ring-buffer of cached KV data
2023-03-24 22:17:37 +01:00
struct llama_kv_cache {
2023-09-28 18:04:36 +02:00
bool has_shift = false ;
uint32_t head = 0 ;
uint32_t size = 0 ;
// computed before each graph build
uint32_t n = 0 ;
std : : vector < llama_kv_cell > cells ;
2023-07-03 20:43:55 +02:00
struct ggml_tensor * k = NULL ;
struct ggml_tensor * v = NULL ;
2023-03-24 22:17:37 +01:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
struct ggml_context * ctx = NULL ;
2023-03-24 22:17:37 +01:00
2023-08-21 22:07:43 +02:00
llama_buffer buf ;
2023-03-24 22:17:37 +01:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
~ llama_kv_cache ( ) {
if ( ctx ) {
ggml_free ( ctx ) ;
}
2023-06-14 19:47:19 +02:00
# ifdef GGML_USE_CUBLAS
ggml_cuda_free_data ( k ) ;
ggml_cuda_free_data ( v ) ;
# endif // GGML_USE_CUBLAS
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-03-24 22:17:37 +01:00
} ;
2023-06-24 10:47:58 +02:00
struct llama_vocab {
using id = int32_t ;
using token = std : : string ;
2023-08-21 22:07:43 +02:00
using ttype = llama_token_type ;
2023-06-24 10:47:58 +02:00
2023-08-21 22:07:43 +02:00
struct token_data {
token text ;
2023-06-24 10:47:58 +02:00
float score ;
2023-08-21 22:07:43 +02:00
ttype type ;
2023-06-24 10:47:58 +02:00
} ;
2023-08-23 22:08:04 +02:00
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM ;
2023-08-21 22:07:43 +02:00
2023-06-24 10:47:58 +02:00
std : : unordered_map < token , id > token_to_id ;
2023-08-21 22:07:43 +02:00
std : : vector < token_data > id_to_token ;
2023-08-23 22:08:04 +02:00
std : : map < std : : pair < std : : string , std : : string > , int > bpe_ranks ;
2023-08-21 22:07:43 +02:00
// default LLaMA special tokens
id special_bos_id = 1 ;
id special_eos_id = 2 ;
2023-08-23 01:39:39 +02:00
id special_unk_id = 0 ;
2023-08-21 22:07:43 +02:00
id special_sep_id = - 1 ;
id special_pad_id = - 1 ;
id linefeed_id = 13 ;
2023-10-02 09:42:02 +02:00
id special_prefix_id = 32007 ;
id special_middle_id = 32009 ;
id special_suffix_id = 32008 ;
id special_eot_id = 32010 ;
2023-08-23 22:08:04 +02:00
int find_bpe_rank ( std : : string token_left , std : : string token_right ) const {
2023-08-26 20:27:07 +02:00
replace_all ( token_left , " " , " \u0120 " ) ;
replace_all ( token_left , " \n " , " \u010A " ) ;
replace_all ( token_right , " " , " \u0120 " ) ;
replace_all ( token_right , " \n " , " \u010A " ) ;
2023-08-23 22:08:04 +02:00
auto it = bpe_ranks . find ( std : : make_pair ( token_left , token_right ) ) ;
if ( it = = bpe_ranks . end ( ) ) {
return - 1 ;
}
return it - > second ;
}
2023-06-24 10:47:58 +02:00
} ;
2023-03-22 06:32:36 +01:00
struct llama_model {
2023-08-21 22:07:43 +02:00
e_model type = MODEL_UNKNOWN ;
2023-08-23 22:08:04 +02:00
llm_arch arch = LLM_ARCH_UNKNOWN ;
2023-08-21 22:07:43 +02:00
llama_ftype ftype = LLAMA_FTYPE_ALL_F32 ;
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
std : : string name = " n/a " ;
2023-09-20 18:12:47 +02:00
llama_hparams hparams = { } ;
2023-08-21 22:07:43 +02:00
llama_vocab vocab ;
2023-03-22 06:32:36 +01:00
struct ggml_tensor * tok_embeddings ;
2023-09-15 21:02:13 +02:00
struct ggml_tensor * pos_embeddings ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
struct ggml_tensor * output_norm ;
struct ggml_tensor * output_norm_b ;
2023-03-22 06:32:36 +01:00
struct ggml_tensor * output ;
std : : vector < llama_layer > layers ;
2023-08-23 22:08:04 +02:00
2023-06-06 21:33:23 +02:00
int n_gpu_layers ;
2023-03-22 06:32:36 +01:00
2023-03-24 22:17:37 +01:00
// context
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
struct ggml_context * ctx = NULL ;
2023-03-24 22:17:37 +01:00
// the model memory buffer
2023-08-21 22:07:43 +02:00
llama_buffer buf ;
2023-03-24 22:17:37 +01:00
2023-03-29 08:31:26 +02:00
// model memory mapped file
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
std : : unique_ptr < llama_mmap > mapping ;
2023-03-29 08:31:26 +02:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
// objects representing data potentially being locked in memory
llama_mlock mlock_buf ;
llama_mlock mlock_mmap ;
// for quantize-stats only
std : : vector < std : : pair < std : : string , struct ggml_tensor * > > tensors_by_name ;
2023-06-24 10:47:58 +02:00
int64_t t_load_us = 0 ;
int64_t t_start_us = 0 ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
// Destructor: releases the ggml context (if one was created) and, on GPU
// builds, the backend-side data attached to every tensor in tensors_by_name.
~llama_model() {
    if (ctx != NULL) {
        ggml_free(ctx);
    }

#ifdef GGML_USE_CUBLAS
    // CUDA build: free per-tensor device data, then the shared scratch buffer
    for (auto & named_tensor : tensors_by_name) {
        ggml_cuda_free_data(named_tensor.second);
    }
    ggml_cuda_free_scratch();
#elif defined(GGML_USE_CLBLAST)
    // OpenCL build: free per-tensor device data
    for (auto & named_tensor : tensors_by_name) {
        ggml_cl_free_data(named_tensor.second);
    }
#endif
}
2023-03-22 06:32:36 +01:00
} ;
// Per-context state: a reference to the model it was created from plus the
// KV cache, RNG, timing/usage counters, output buffers and compute buffers
// declared below.
struct llama_context {
2023-09-28 21:42:38 +02:00
// Binds the context to `model` and copies the model's t_start_us / t_load_us
// timestamps into the context's own counters.
llama_context ( const llama_model & model ) : model ( model ) , t_start_us ( model . t_start_us ) , t_load_us ( model . t_load_us ) { }
2023-07-01 20:14:59 +02:00
// Frees the Metal context (only when built with GGML_USE_METAL) and the
// graph allocator, each only if it was actually created.
~ llama_context ( ) {
2023-07-30 15:58:01 +02:00
# ifdef GGML_USE_METAL
2023-07-01 20:14:59 +02:00
if ( ctx_metal ) {
ggml_metal_free ( ctx_metal ) ;
}
# endif
2023-07-30 15:58:01 +02:00
if ( alloc ) {
ggml_allocr_free ( alloc ) ;
}
}
2023-09-28 21:42:38 +02:00
llama_cparams cparams ;
// held by const reference, so the model must outlive this context
const llama_model & model ;
// key + value cache for the self attention
struct llama_kv_cache kv_self ;
2023-03-22 06:32:36 +01:00
std : : mt19937 rng ;
2023-03-29 22:51:37 +02:00
bool has_evaluated_once = false ;
2023-03-22 06:32:36 +01:00
2023-09-28 21:42:38 +02:00
// timing counters; the `_us` suffix suggests microseconds (TODO confirm).
// t_start_us and t_load_us are initialised from the model in the constructor.
int64_t t_start_us ;
int64_t t_load_us ;
2023-03-22 06:32:36 +01:00
int64_t t_sample_us = 0 ;
2023-03-25 15:34:23 +01:00
int64_t t_p_eval_us = 0 ;
2023-09-28 21:42:38 +02:00
int64_t t_eval_us = 0 ;
2023-03-22 06:32:36 +01:00
int32_t n_sample = 0 ; // number of tokens sampled
2023-03-25 15:34:23 +01:00
int32_t n_p_eval = 0 ; // number of tokens in eval calls for the prompt (with batch size > 1)
2023-09-28 21:42:38 +02:00
int32_t n_eval = 0 ; // number of eval calls
2023-03-22 06:32:36 +01:00
// decode output (2-dimensional array: [n_tokens][n_vocab])
std : : vector < float > logits ;
bool logits_all = false ;
2023-03-24 16:05:13 +01:00
// input embedding (1-dimensional array: [n_embd])
std : : vector < float > embedding ;
2023-03-24 22:17:37 +01:00
2023-07-07 18:24:01 +02:00
// reusable buffer for `struct ggml_graph_plan.work_data`
std : : vector < uint8_t > work_buffer ;
2023-03-24 22:17:37 +01:00
// memory buffers used to evaluate the model
2023-08-21 22:07:43 +02:00
llama_buffer buf_compute ;
2023-07-30 15:58:01 +02:00
2023-08-21 22:07:43 +02:00
llama_buffer buf_alloc ;
2023-07-30 15:58:01 +02:00
// released with ggml_allocr_free() in the destructor when non-NULL
ggml_allocr * alloc = NULL ;
2023-03-24 22:17:37 +01:00
2023-06-04 22:34:30 +02:00
# ifdef GGML_USE_METAL
// released with ggml_metal_free() in the destructor when non-NULL
ggml_metal_context * ctx_metal = NULL ;
# endif
2023-07-10 17:49:56 +02:00
# ifdef GGML_USE_MPI
// NOTE(review): not released by the destructor above; presumably freed elsewhere, confirm
ggml_mpi_context * ctx_mpi = NULL ;
# endif
2023-03-22 06:32:36 +01:00
} ;
2023-08-21 22:07:43 +02:00
//
// kv cache helpers
//
static bool llama_kv_cache_init (
const struct llama_hparams & hparams ,
struct llama_kv_cache & cache ,
ggml_type wtype ,
2023-09-28 21:42:38 +02:00
uint32_t n_ctx ,
2023-08-21 22:07:43 +02:00
int n_gpu_layers ) {
2023-09-28 18:04:36 +02:00
const uint32_t n_embd = hparams . n_embd_gqa ( ) ;
const uint32_t n_layer = hparams . n_layer ;
2023-08-21 22:07:43 +02:00
const int64_t n_mem = n_layer * n_ctx ;
const int64_t n_elements = n_embd * n_mem ;
2023-09-28 18:04:36 +02:00
cache . has_shift = false ;
cache . head = 0 ;
cache . size = n_ctx ;
cache . cells . clear ( ) ;
cache . cells . resize ( n_ctx ) ;
2023-08-21 22:07:43 +02:00
cache . buf . resize ( 2u * n_elements * ggml_type_size ( wtype ) + 2u * MB ) ;
struct ggml_init_params params ;
params . mem_size = cache . buf . size ;
params . mem_buffer = cache . buf . data ;
params . no_alloc = false ;
cache . ctx = ggml_init ( params ) ;
2023-08-09 22:46:40 +02:00
2023-08-21 22:07:43 +02:00
if ( ! cache . ctx ) {
LLAMA_LOG_ERROR ( " %s: failed to allocate memory for kv cache \n " , __func__ ) ;
return false ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
cache . k = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
cache . v = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
ggml_set_name ( cache . k , " cache_k " ) ;
ggml_set_name ( cache . v , " cache_v " ) ;
( void ) n_gpu_layers ;
# ifdef GGML_USE_CUBLAS
2023-09-28 21:42:38 +02:00
size_t vram_kv_cache = 0 ;
2023-09-28 18:04:36 +02:00
if ( n_gpu_layers > ( int ) n_layer + 1 ) {
2023-08-21 22:07:43 +02:00
ggml_cuda_assign_buffers_no_scratch ( cache . v ) ;
2023-09-28 21:42:38 +02:00
LLAMA_LOG_INFO ( " %s: offloading v cache to GPU \n " , __func__ ) ;
vram_kv_cache + = ggml_nbytes ( cache . v ) ;
2023-08-21 22:07:43 +02:00
}
2023-09-28 18:04:36 +02:00
if ( n_gpu_layers > ( int ) n_layer + 2 ) {
2023-08-21 22:07:43 +02:00
ggml_cuda_assign_buffers_no_scratch ( cache . k ) ;
2023-09-28 21:42:38 +02:00
LLAMA_LOG_INFO ( " %s: offloading k cache to GPU \n " , __func__ ) ;
vram_kv_cache + = ggml_nbytes ( cache . k ) ;
}
if ( vram_kv_cache > 0 ) {
LLAMA_LOG_INFO ( " %s: VRAM kv self = %.2f MB \n " , __func__ , vram_kv_cache / 1024.0 / 1024.0 ) ;
2023-08-21 22:07:43 +02:00
}
# endif // GGML_USE_CUBLAS
return true ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-09-28 18:04:36 +02:00
// find an empty slot of size "n_tokens" in the cache
// updates the cache head
static bool llama_kv_cache_find_slot (
2023-10-03 20:04:01 +02:00
struct llama_kv_cache & cache ,
const struct llama_batch & batch ) {
2023-09-28 18:04:36 +02:00
const uint32_t n_ctx = cache . size ;
const uint32_t n_tokens = batch . n_tokens ;
if ( n_tokens > n_ctx ) {
LLAMA_LOG_ERROR ( " %s: n_tokens=%d > n_ctx=%d \n " , __func__ , n_tokens , n_ctx ) ;
return false ;
}
uint32_t n_tested = 0 ;
while ( true ) {
if ( cache . head + n_tokens > n_ctx ) {
cache . head = 0 ;
n_tested + = n_ctx - cache . head ;
continue ;
}
bool found = true ;
for ( uint32_t i = 0 ; i < n_tokens ; i + + ) {
if ( cache . cells [ cache . head + i ] . pos > = 0 ) {
found = false ;
cache . head + = i + 1 ;
n_tested + = i + 1 ;
break ;
}
}
if ( found ) {
break ;
}
if ( n_tested > = n_ctx ) {
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
return false ;
}
}
for ( uint32_t i = 0 ; i < n_tokens ; i + + ) {
cache . cells [ cache . head + i ] . pos = batch . pos [ i ] ;
cache . cells [ cache . head + i ] . seq_id . insert ( batch . seq_id [ i ] ) ;
}
return true ;
}
// Return one past the index of the highest cell currently in use, i.e. the
// number of leading cells that have to be considered; 0 when the cache holds
// nothing (or has no cells at all).
//
// A cell counts as in use when its pos is >= 0 and it belongs to at least one
// sequence.
static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
    // Iterate with i = index + 1 so the unsigned counter can reach cell 0
    // without underflowing; the previous `i = size - 1; i > 0` form never
    // looked at cell 0 and wrapped around for an empty cache.
    for (uint32_t i = cache.size; i > 0; --i) {
        const auto & cell = cache.cells[i - 1];
        if (cell.pos >= 0 && !cell.seq_id.empty()) {
            return i;
        }
    }

    return 0;
}
// Mark the cells with indices in [c0, c1) as free: pos is reset to -1 and all
// sequence ids are dropped.
//
// A negative c0 means "from the first cell"; a negative c1 means "up to the
// end of the cache". A c1 beyond the end of the cache is clamped to the cache
// size so the loop can never index past the cell array.
static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
    const int32_t n_cells = (int32_t) cache.size;

    if (c0 < 0) {
        c0 = 0;
    }
    if (c1 < 0 || c1 > n_cells) {
        c1 = n_cells;
    }

    for (int32_t i = c0; i < c1; ++i) {
        cache.cells[i].pos = -1;
        cache.cells[i].seq_id.clear();
    }
}
static void llama_kv_cache_seq_rm (
2023-10-03 20:04:01 +02:00
struct llama_kv_cache & cache ,
llama_seq_id seq_id ,
llama_pos p0 ,
llama_pos p1 ) {
if ( p0 < 0 ) p0 = 0 ;
if ( p1 < 0 ) p1 = std : : numeric_limits < llama_pos > : : max ( ) ;
2023-09-28 18:04:36 +02:00
for ( uint32_t i = 0 ; i < cache . size ; + + i ) {
if ( cache . cells [ i ] . has_seq_id ( seq_id ) & & cache . cells [ i ] . pos > = p0 & & cache . cells [ i ] . pos < p1 ) {
cache . cells [ i ] . seq_id . erase ( seq_id ) ;
if ( cache . cells [ i ] . seq_id . empty ( ) ) {
cache . cells [ i ] . pos = - 1 ;
}
}
}
}
static void llama_kv_cache_seq_cp (
2023-10-03 20:04:01 +02:00
struct llama_kv_cache & cache ,
llama_seq_id seq_id_src ,
llama_seq_id seq_id_dst ,
llama_pos p0 ,
llama_pos p1 ) {
if ( p0 < 0 ) p0 = 0 ;
if ( p1 < 0 ) p1 = std : : numeric_limits < llama_pos > : : max ( ) ;
2023-09-28 18:04:36 +02:00
for ( uint32_t i = 0 ; i < cache . size ; + + i ) {
if ( cache . cells [ i ] . has_seq_id ( seq_id_src ) & & cache . cells [ i ] . pos > = p0 & & cache . cells [ i ] . pos < p1 ) {
cache . cells [ i ] . seq_id . insert ( seq_id_dst ) ;
}
}
}
// Free every cell that does not belong to sequence `seq_id`: its pos is reset
// to -1 and its sequence set is emptied. Cells that do contain `seq_id` are
// left untouched, including any other sequence ids they carry.
static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
    for (uint32_t idx = 0; idx < cache.size; ++idx) {
        auto & cell = cache.cells[idx];

        if (cell.has_seq_id(seq_id)) {
            continue;
        }

        cell.pos = -1;
        cell.seq_id.clear();
    }
}
static void llama_kv_cache_seq_shift (
2023-10-03 20:04:01 +02:00
struct llama_kv_cache & cache ,
llama_seq_id seq_id ,
llama_pos p0 ,
llama_pos p1 ,
llama_pos delta ) {
if ( p0 < 0 ) p0 = 0 ;
if ( p1 < 0 ) p1 = std : : numeric_limits < llama_pos > : : max ( ) ;
2023-09-28 18:04:36 +02:00
for ( uint32_t i = 0 ; i < cache . size ; + + i ) {
if ( cache . cells [ i ] . has_seq_id ( seq_id ) & & cache . cells [ i ] . pos > = p0 & & cache . cells [ i ] . pos < p1 ) {
cache . cells [ i ] . pos + = delta ;
if ( cache . cells [ i ] . pos < 0 ) {
cache . cells [ i ] . pos = - 1 ;
cache . cells [ i ] . seq_id . clear ( ) ;
} else {
cache . has_shift = true ;
cache . cells [ i ] . delta = delta ;
}
}
}
}
2023-08-21 22:07:43 +02:00
//
// model loading and saving
//
2023-08-23 22:08:04 +02:00
// File-format version identifiers for GGUF files (V1 and V2).
// llama_file_version_name() maps these values to display strings.
enum llama_fver {
2023-08-21 22:07:43 +02:00
GGUF_FILE_VERSION_V1 = 1 ,
2023-08-27 13:19:54 +02:00
GGUF_FILE_VERSION_V2 = 2 ,
2023-08-21 22:07:43 +02:00
} ;
2023-08-23 22:08:04 +02:00
// Map a GGUF file version to a display string. Values other than
// GGUF_FILE_VERSION_V1 / GGUF_FILE_VERSION_V2 yield the "unknown" label.
// The returned pointer refers to a string literal and must not be freed.
static const char * llama_file_version_name(llama_fver version) {
    if (version == GGUF_FILE_VERSION_V1) {
        return " GGUF V1 (support until nov 2023) ";
    }
    if (version == GGUF_FILE_VERSION_V2) {
        return " GGUF V2 (latest) ";
    }

    return " unknown ";
}
2023-08-23 22:08:04 +02:00
// Format a tensor shape as a human-readable string.
//
// Each dimension is printed as a decimal integer right-aligned in a field of
// width 5; consecutive dimensions are separated by ", ".
// Example: {4096, 32000} -> " 4096, 32000".
//
// ne: list of dimension sizes. ne.at(0) is read unconditionally, so an empty
//     vector raises std::out_of_range.
//
// The text is assembled in a fixed 256-byte buffer; snprintf is always given
// the remaining capacity, so overly long output is truncated, never overflowed.
static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
    char buf[256];
    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
    for (size_t i = 1; i < ne.size(); i++) {
        // append after what has been written so far, bounded by the space left
        const size_t len = strlen(buf);
        snprintf(buf + len, sizeof(buf) - len, ", %5" PRId64, ne.at(i));
    }
    return buf;
}
2023-08-21 22:07:43 +02:00
static std : : string llama_format_tensor_shape ( const struct ggml_tensor * t ) {
char buf [ 256 ] ;
snprintf ( buf , sizeof ( buf ) , " %5 " PRId64 , t - > ne [ 0 ] ) ;
for ( int i = 1 ; i < GGML_MAX_DIMS ; i + + ) {
snprintf ( buf + strlen ( buf ) , sizeof ( buf ) - strlen ( buf ) , " , %5 " PRId64 , t - > ne [ i ] ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
return buf ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
struct llama_model_loader {
int n_kv = 0 ;
int n_tensors = 0 ;
int n_created = 0 ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
int64_t n_elements = 0 ;
2023-09-17 14:33:28 +02:00
size_t n_bytes = 0 ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
bool use_mmap = false ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-23 22:08:04 +02:00
llama_file file ;
2023-08-21 22:07:43 +02:00
llama_ftype ftype ;
2023-08-23 22:08:04 +02:00
llama_fver fver ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
std : : unique_ptr < llama_mmap > mapping ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
struct gguf_context * ctx_gguf = NULL ;
struct ggml_context * ctx_meta = NULL ;
llama_model_loader ( const std : : string & fname , bool use_mmap ) : file ( fname . c_str ( ) , " rb " ) {
struct gguf_init_params params = {
/*.no_alloc = */ true ,
/*.ctx = */ & ctx_meta ,
} ;
ctx_gguf = gguf_init_from_file ( fname . c_str ( ) , params ) ;
if ( ! ctx_gguf ) {
throw std : : runtime_error ( format ( " %s: failed to load model from %s \n " , __func__ , fname . c_str ( ) ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
n_kv = gguf_get_n_kv ( ctx_gguf ) ;
n_tensors = gguf_get_n_tensors ( ctx_gguf ) ;
2023-05-20 14:58:15 +02:00
2023-08-23 22:08:04 +02:00
fver = ( enum llama_fver ) gguf_get_version ( ctx_gguf ) ;
2023-08-21 22:07:43 +02:00
for ( int i = 0 ; i < n_tensors ; i + + ) {
const char * name = gguf_get_tensor_name ( ctx_gguf , i ) ;
struct ggml_tensor * t = ggml_get_tensor ( ctx_meta , name ) ;
n_elements + = ggml_nelements ( t ) ;
2023-09-17 14:33:28 +02:00
n_bytes + = ggml_nbytes ( t ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-05-20 14:58:15 +02:00
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " %s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s) \n " ,
__func__ , n_kv , n_tensors , fname . c_str ( ) , llama_file_version_name ( fver ) ) ;
2023-07-23 14:09:47 +02:00
2023-08-21 22:07:43 +02:00
// determine file type based on the number of tensors for each quantization and print meta data
// TODO: make optional
{
std : : map < enum ggml_type , uint32_t > n_type ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
uint32_t n_type_max = 0 ;
enum ggml_type type_max = GGML_TYPE_F32 ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
for ( int i = 0 ; i < n_tensors ; i + + ) {
const char * name = gguf_get_tensor_name ( ctx_gguf , i ) ;
struct ggml_tensor * meta = ggml_get_tensor ( ctx_meta , name ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
n_type [ meta - > type ] + + ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
if ( n_type_max < n_type [ meta - > type ] ) {
n_type_max = n_type [ meta - > type ] ;
type_max = meta - > type ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " %s: - tensor %4d: %32s %-8s [ %s ] \n " , __func__ , i , name , ggml_type_name ( meta - > type ) , llama_format_tensor_shape ( meta ) . c_str ( ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
switch ( type_max ) {
case GGML_TYPE_F32 : ftype = LLAMA_FTYPE_ALL_F32 ; break ;
case GGML_TYPE_F16 : ftype = LLAMA_FTYPE_MOSTLY_F16 ; break ;
case GGML_TYPE_Q4_0 : ftype = LLAMA_FTYPE_MOSTLY_Q4_0 ; break ;
case GGML_TYPE_Q4_1 : ftype = LLAMA_FTYPE_MOSTLY_Q4_1 ; break ;
case GGML_TYPE_Q5_0 : ftype = LLAMA_FTYPE_MOSTLY_Q5_0 ; break ;
case GGML_TYPE_Q5_1 : ftype = LLAMA_FTYPE_MOSTLY_Q5_1 ; break ;
case GGML_TYPE_Q8_0 : ftype = LLAMA_FTYPE_MOSTLY_Q8_0 ; break ;
case GGML_TYPE_Q2_K : ftype = LLAMA_FTYPE_MOSTLY_Q2_K ; break ;
case GGML_TYPE_Q3_K : ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M ; break ;
case GGML_TYPE_Q4_K : ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M ; break ;
case GGML_TYPE_Q5_K : ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M ; break ;
case GGML_TYPE_Q6_K : ftype = LLAMA_FTYPE_MOSTLY_Q6_K ; break ;
default :
{
LLAMA_LOG_WARN ( " %s: unknown type %s \n " , __func__ , ggml_type_name ( type_max ) ) ;
ftype = LLAMA_FTYPE_ALL_F32 ;
} break ;
2023-07-20 12:47:26 +02:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-22 19:05:59 +02:00
// this is a way to mark that we have "guessed" the file type
ftype = ( llama_ftype ) ( ftype | LLAMA_FTYPE_GUESSED ) ;
{
const int kid = gguf_find_key ( ctx_gguf , " general.file_type " ) ;
if ( kid > = 0 ) {
ftype = ( llama_ftype ) gguf_get_val_u32 ( ctx_gguf , kid ) ;
}
}
2023-08-21 22:07:43 +02:00
for ( int i = 0 ; i < n_kv ; i + + ) {
const char * name = gguf_get_key ( ctx_gguf , i ) ;
const enum gguf_type type = gguf_get_kv_type ( ctx_gguf , i ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " %s: - kv %3d: %42s %-8s \n " , __func__ , i , name , gguf_type_name ( type ) ) ;
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
// print type counts
for ( auto & kv : n_type ) {
if ( kv . second = = 0 ) {
continue ;
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " %s: - type %4s: %4d tensors \n " , __func__ , ggml_type_name ( kv . first ) , kv . second ) ;
}
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
if ( ! llama_mmap : : SUPPORTED ) {
2023-08-21 22:07:43 +02:00
LLAMA_LOG_WARN ( " %s: mmap is not supported on this platform \n " , __func__ ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
use_mmap = false ;
}
2023-08-21 22:07:43 +02:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
this - > use_mmap = use_mmap ;
}
2023-08-21 22:07:43 +02:00
// Releases the two contexts held by the loader.
// Each handle is released only when it is non-null: ctx_gguf goes to
// gguf_free() first, then ctx_meta goes to ggml_free().
~llama_model_loader() {
    if (ctx_gguf != nullptr) {
        gguf_free(ctx_gguf);
    }

    if (ctx_meta != nullptr) {
        ggml_free(ctx_meta);
    }
}
2023-08-23 22:08:04 +02:00
std : : string get_arch_name ( ) const {
const auto kv = LLM_KV ( LLM_ARCH_UNKNOWN ) ;
std : : string arch_name ;
GGUF_GET_KEY ( ctx_gguf , arch_name , gguf_get_val_str , GGUF_TYPE_STRING , false , kv ( LLM_KV_GENERAL_ARCHITECTURE ) ) ;
return arch_name ;
}
enum llm_arch get_arch ( ) const {
const std : : string arch_name = get_arch_name ( ) ;
return llm_arch_from_string ( arch_name ) ;
}
2023-08-21 22:07:43 +02:00
// Returns the name that gguf_get_tensor_name() reports for tensor index `i`
// in ctx_gguf. The index is forwarded as-is; no range check is done here.
const char * get_tensor_name(int i) const {
    const char * tensor_name = gguf_get_tensor_name(ctx_gguf, i);
    return tensor_name;
}
2023-04-17 17:28:55 +02:00
2023-08-21 22:07:43 +02:00
struct ggml_tensor * get_tensor_meta ( int i ) const {
return ggml_get_tensor ( ctx_meta , get_tensor_name ( i ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
// Computes the memory needed for all n_tensors tensors.
//
//   ctx_size_p     - out: per-tensor bookkeeping (sizeof(ggml_tensor) +
//                    GGML_OBJECT_SIZE), plus the padded data size of every
//                    tensor when mmap is NOT used
//   mmapped_size_p - out: sum of the padded data sizes when mmap IS used,
//                    otherwise 0
//
// Both outputs are reset to 0 before accumulating.
void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
    ctx_size_p     = 0;
    mmapped_size_p = 0;

    for (int idx = 0; idx < n_tensors; ++idx) {
        struct ggml_tensor * meta = get_tensor_meta(idx);

        // the tensor header always lives in the ggml context
        ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;

        // the data is accounted to the mapping or to the context
        const size_t n_data = ggml_nbytes_pad(meta);
        if (use_mmap) {
            mmapped_size_p += n_data;
        } else {
            ctx_size_p += n_data;
        }
    }
}
struct ggml_tensor * create_tensor_for ( struct ggml_context * ctx , struct ggml_tensor * meta , ggml_backend backend ) {
if ( backend ! = GGML_BACKEND_CPU ) {
ggml_set_no_alloc ( ctx , true ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
struct ggml_tensor * tensor = ggml_dup_tensor ( ctx , meta ) ;
tensor - > backend = backend ; // TODO: ggml_set_backend
ggml_set_name ( tensor , ggml_get_name ( meta ) ) ;
2023-06-06 21:33:23 +02:00
2023-06-12 14:44:16 +02:00
if ( backend ! = GGML_BACKEND_CPU ) {
2023-08-21 22:07:43 +02:00
ggml_set_no_alloc ( ctx , use_mmap ) ;
2023-06-12 14:44:16 +02:00
}
2023-08-21 22:07:43 +02:00
n_created + + ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
return tensor ;
}
2023-08-23 22:08:04 +02:00
struct ggml_tensor * create_tensor ( struct ggml_context * ctx , const std : : string & name , const std : : vector < int64_t > & ne , ggml_backend backend ) {
2023-08-21 22:07:43 +02:00
struct ggml_tensor * cur = ggml_get_tensor ( ctx_meta , name . c_str ( ) ) ;
if ( cur = = NULL ) {
throw std : : runtime_error ( format ( " %s: tensor '%s' not found " , __func__ , name . c_str ( ) ) ) ;
}
{
bool is_ok = true ;
for ( size_t i = 0 ; i < ne . size ( ) ; + + i ) {
if ( ne [ i ] ! = cur - > ne [ i ] ) {
is_ok = false ;
break ;
}
}
if ( ! is_ok ) {
throw std : : runtime_error (
format ( " %s: tensor '%s' has wrong shape; expected %s, got %s " ,
__func__ , name . c_str ( ) ,
llama_format_tensor_shape ( ne ) . c_str ( ) ,
llama_format_tensor_shape ( cur ) . c_str ( ) ) ) ;
}
}
return create_tensor_for ( ctx , cur , backend ) ;
}
2023-05-13 10:23:15 +02:00
void done_getting_tensors ( ) const {
2023-08-21 22:07:43 +02:00
if ( n_created ! = n_tensors ) {
throw std : : runtime_error ( format ( " %s: wrong number of tensors; expected %d, got %d " , __func__ , n_tensors , n_created ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
}
2023-08-21 22:07:43 +02:00
size_t file_offset ( const char * name ) const {
const int idx = gguf_find_tensor ( ctx_gguf , name ) ;
if ( idx < 0 ) {
throw std : : runtime_error ( format ( " %s: tensor '%s' not found in the file " , __func__ , name ) ) ;
}
return gguf_get_data_offset ( ctx_gguf ) + gguf_get_tensor_offset ( ctx_gguf , idx ) ;
}
void load_data_for ( struct ggml_tensor * cur ) const {
const size_t offs = file_offset ( ggml_get_name ( cur ) ) ;
if ( use_mmap ) {
cur - > data = ( uint8_t * ) mapping - > addr + offs ;
} else {
file . seek ( offs , SEEK_SET ) ;
file . read_raw ( cur - > data , ggml_nbytes ( cur ) ) ;
}
}
void load_all_data ( struct ggml_context * ctx , llama_progress_callback progress_callback , void * progress_callback_user_data , llama_mlock * lmlock ) {
size_t size_data = 0 ;
size_t size_lock = 0 ;
size_t size_pref = 0 ; // prefetch
for ( int i = 0 ; i < gguf_get_n_tensors ( ctx_gguf ) ; i + + ) {
struct ggml_tensor * cur = ggml_get_tensor ( ctx , gguf_get_tensor_name ( ctx_gguf , i ) ) ;
size_data + = ggml_nbytes ( cur ) ;
if ( cur - > backend = = GGML_BACKEND_CPU ) {
size_pref + = ggml_nbytes ( cur ) ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
if ( use_mmap ) {
2023-08-21 22:07:43 +02:00
mapping . reset ( new llama_mmap ( & file , size_pref , ggml_is_numa ( ) ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
if ( lmlock ) {
lmlock - > init ( mapping - > addr ) ;
}
}
size_t done_size = 0 ;
2023-08-21 22:07:43 +02:00
for ( int i = 0 ; i < gguf_get_n_tensors ( ctx_gguf ) ; i + + ) {
struct ggml_tensor * cur = ggml_get_tensor ( ctx , gguf_get_tensor_name ( ctx_gguf , i ) ) ;
GGML_ASSERT ( cur ) ; // unused tensors should have been caught by load_data already
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
if ( progress_callback ) {
2023-08-21 22:07:43 +02:00
progress_callback ( ( float ) done_size / size_data , progress_callback_user_data ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-06-12 14:44:16 +02:00
// allocate temp buffer if not using mmap
2023-08-21 22:07:43 +02:00
if ( ! use_mmap & & cur - > data = = NULL ) {
GGML_ASSERT ( cur - > backend ! = GGML_BACKEND_CPU ) ;
2023-09-08 03:46:56 +02:00
# ifdef GGML_USE_CPU_HBM
cur - > data = ( uint8_t * ) hbw_malloc ( ggml_nbytes ( cur ) ) ;
# else
cur - > data = ( uint8_t * ) malloc ( ggml_nbytes ( cur ) ) ;
# endif
2023-06-12 14:44:16 +02:00
}
2023-08-21 22:07:43 +02:00
load_data_for ( cur ) ;
2023-06-12 14:44:16 +02:00
2023-08-21 22:07:43 +02:00
switch ( cur - > backend ) {
2023-06-12 14:44:16 +02:00
case GGML_BACKEND_CPU :
if ( use_mmap & & lmlock ) {
2023-08-21 22:07:43 +02:00
size_lock + = ggml_nbytes ( cur ) ;
lmlock - > grow_to ( size_lock ) ;
2023-06-12 14:44:16 +02:00
}
break ;
2023-09-28 21:42:38 +02:00
# ifdef GGML_USE_CUBLAS
2023-06-12 14:44:16 +02:00
case GGML_BACKEND_GPU :
case GGML_BACKEND_GPU_SPLIT :
2023-08-21 22:07:43 +02:00
// old code:
//ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
// TODO: test if this works !!
ggml_cuda_transform_tensor ( cur - > data , cur ) ;
2023-06-12 14:44:16 +02:00
if ( ! use_mmap ) {
2023-08-21 22:07:43 +02:00
free ( cur - > data ) ;
2023-06-12 14:44:16 +02:00
}
break ;
# elif defined(GGML_USE_CLBLAST)
case GGML_BACKEND_GPU :
2023-08-21 22:07:43 +02:00
ggml_cl_transform_tensor ( cur - > data , cur ) ;
2023-06-12 14:44:16 +02:00
if ( ! use_mmap ) {
2023-08-21 22:07:43 +02:00
free ( cur - > data ) ;
2023-06-12 14:44:16 +02:00
}
break ;
# endif
default :
continue ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-06-12 14:44:16 +02:00
2023-08-21 22:07:43 +02:00
done_size + = ggml_nbytes ( cur ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
}
2023-08-21 22:07:43 +02:00
} ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
//
// load LLaMA models
//
2023-06-28 19:13:02 +02:00
2023-09-28 21:42:38 +02:00
static std : : string llama_model_arch_name ( llm_arch arch ) {
auto it = LLM_ARCH_NAMES . find ( arch ) ;
if ( it = = LLM_ARCH_NAMES . end ( ) ) {
return " unknown " ;
}
return it - > second ;
}
static std : : string llama_model_ftype_name ( llama_ftype ftype ) {
2023-08-22 19:05:59 +02:00
if ( ftype & LLAMA_FTYPE_GUESSED ) {
return llama_model_ftype_name ( ( enum llama_ftype ) ( ftype & ~ LLAMA_FTYPE_GUESSED ) ) + " (guessed) " ;
}
2023-08-21 22:07:43 +02:00
switch ( ftype ) {
case LLAMA_FTYPE_ALL_F32 : return " all F32 " ;
case LLAMA_FTYPE_MOSTLY_F16 : return " mostly F16 " ;
case LLAMA_FTYPE_MOSTLY_Q4_0 : return " mostly Q4_0 " ;
case LLAMA_FTYPE_MOSTLY_Q4_1 : return " mostly Q4_1 " ;
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 :
return " mostly Q4_1, some F16 " ;
case LLAMA_FTYPE_MOSTLY_Q5_0 : return " mostly Q5_0 " ;
case LLAMA_FTYPE_MOSTLY_Q5_1 : return " mostly Q5_1 " ;
case LLAMA_FTYPE_MOSTLY_Q8_0 : return " mostly Q8_0 " ;
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K : return " mostly Q2_K " ;
case LLAMA_FTYPE_MOSTLY_Q3_K_S : return " mostly Q3_K - Small " ;
case LLAMA_FTYPE_MOSTLY_Q3_K_M : return " mostly Q3_K - Medium " ;
case LLAMA_FTYPE_MOSTLY_Q3_K_L : return " mostly Q3_K - Large " ;
case LLAMA_FTYPE_MOSTLY_Q4_K_S : return " mostly Q4_K - Small " ;
case LLAMA_FTYPE_MOSTLY_Q4_K_M : return " mostly Q4_K - Medium " ;
case LLAMA_FTYPE_MOSTLY_Q5_K_S : return " mostly Q5_K - Small " ;
case LLAMA_FTYPE_MOSTLY_Q5_K_M : return " mostly Q5_K - Medium " ;
case LLAMA_FTYPE_MOSTLY_Q6_K : return " mostly Q6_K " ;
default : return " unknown, may not work " ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-08-21 22:07:43 +02:00
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
// Return the short display label (e.g. "7B") for a model size class.
// Any value without a dedicated label maps to "?B".
static const char * llama_model_type_name(e_model type) {
    const char * label = "?B";

    switch (type) {
        case MODEL_1B:  label = "1B";  break;
        case MODEL_3B:  label = "3B";  break;
        case MODEL_7B:  label = "7B";  break;
        case MODEL_13B: label = "13B"; break;
        case MODEL_15B: label = "15B"; break;
        case MODEL_30B: label = "30B"; break;
        case MODEL_34B: label = "34B"; break;
        case MODEL_40B: label = "40B"; break;
        case MODEL_65B: label = "65B"; break;
        case MODEL_70B: label = "70B"; break;
        default:                       break; // keep the "?B" fallback
    }

    return label;
}
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-23 22:08:04 +02:00
// Ask the loader for the model architecture and store it on the model.
// Throws std::runtime_error (naming the architecture string reported by the
// loader) when the result is LLM_ARCH_UNKNOWN.
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
    model.arch = ml.get_arch();

    // known architecture: nothing else to do
    if (model.arch != LLM_ARCH_UNKNOWN) {
        return;
    }

    throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
}
static void llm_load_hparams (
llama_model_loader & ml ,
2023-09-28 21:42:38 +02:00
llama_model & model ) {
2023-08-23 22:08:04 +02:00
struct gguf_context * ctx = ml . ctx_gguf ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-23 22:08:04 +02:00
const auto kv = LLM_KV ( model . arch ) ;
2023-03-24 22:17:37 +01:00
2023-08-21 22:07:43 +02:00
auto & hparams = model . hparams ;
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
// get general kv
GGUF_GET_KEY ( ctx , model . name , gguf_get_val_str , GGUF_TYPE_STRING , false , kv ( LLM_KV_GENERAL_NAME ) ) ;
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
// get hparams kv
2023-09-28 21:42:38 +02:00
GGUF_GET_KEY ( ctx , hparams . n_vocab , gguf_get_arr_n , GGUF_TYPE_ARRAY , true , kv ( LLM_KV_TOKENIZER_LIST ) ) ;
GGUF_GET_KEY ( ctx , hparams . n_ctx_train , gguf_get_val_u32 , GGUF_TYPE_UINT32 , true , kv ( LLM_KV_CONTEXT_LENGTH ) ) ;
GGUF_GET_KEY ( ctx , hparams . n_embd , gguf_get_val_u32 , GGUF_TYPE_UINT32 , true , kv ( LLM_KV_EMBEDDING_LENGTH ) ) ;
GGUF_GET_KEY ( ctx , hparams . n_ff , gguf_get_val_u32 , GGUF_TYPE_UINT32 , true , kv ( LLM_KV_FEED_FORWARD_LENGTH ) ) ;
GGUF_GET_KEY ( ctx , hparams . n_head , gguf_get_val_u32 , GGUF_TYPE_UINT32 , true , kv ( LLM_KV_ATTENTION_HEAD_COUNT ) ) ;
GGUF_GET_KEY ( ctx , hparams . n_layer , gguf_get_val_u32 , GGUF_TYPE_UINT32 , true , kv ( LLM_KV_BLOCK_COUNT ) ) ;
2023-08-23 22:08:04 +02:00
// n_head_kv is optional, default to n_head
hparams . n_head_kv = hparams . n_head ;
GGUF_GET_KEY ( ctx , hparams . n_head_kv , gguf_get_val_u32 , GGUF_TYPE_UINT32 , false , kv ( LLM_KV_ATTENTION_HEAD_COUNT_KV ) ) ;
2023-09-20 18:12:47 +02:00
// rope_freq_base (optional)
2023-09-28 21:42:38 +02:00
hparams . rope_freq_base_train = 10000.0f ;
GGUF_GET_KEY ( ctx , hparams . rope_freq_base_train , gguf_get_val_f32 , GGUF_TYPE_FLOAT32 , false , kv ( LLM_KV_ROPE_FREQ_BASE ) ) ;
2023-08-24 20:04:05 +02:00
2023-08-23 22:08:04 +02:00
// rope_freq_scale (inverse of the kv) is optional
2023-09-28 21:42:38 +02:00
float ropescale = 1.0f ;
GGUF_GET_KEY ( ctx , ropescale , gguf_get_val_f32 , GGUF_TYPE_FLOAT32 , false , kv ( LLM_KV_ROPE_SCALE_LINEAR ) ) ;
hparams . rope_freq_scale_train = 1.0f / ropescale ;
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
// sanity check for n_rot (optional)
{
hparams . n_rot = hparams . n_embd / hparams . n_head ;
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
GGUF_GET_KEY ( ctx , hparams . n_rot , gguf_get_val_u32 , GGUF_TYPE_UINT32 , false , kv ( LLM_KV_ROPE_DIMENSION_COUNT ) ) ;
2023-09-03 07:36:28 +02:00
if ( model . arch = = LLM_ARCH_LLAMA | | model . arch = = LLM_ARCH_FALCON ) {
if ( hparams . n_rot ! = hparams . n_embd / hparams . n_head ) {
throw std : : runtime_error ( format ( " invalid n_rot: %u, expected %u " , hparams . n_rot , hparams . n_embd / hparams . n_head ) ) ;
}
2023-08-21 22:07:43 +02:00
}
2023-09-03 07:36:28 +02:00
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim
2023-08-23 22:08:04 +02:00
}
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
// arch-specific KVs
switch ( model . arch ) {
case LLM_ARCH_LLAMA :
{
GGUF_GET_KEY ( ctx , hparams . f_norm_rms_eps , gguf_get_val_f32 , GGUF_TYPE_FLOAT32 , true , kv ( LLM_KV_ATTENTION_LAYERNORM_RMS_EPS ) ) ;
switch ( hparams . n_layer ) {
case 26 : model . type = e_model : : MODEL_3B ; break ;
case 32 : model . type = e_model : : MODEL_7B ; break ;
case 40 : model . type = e_model : : MODEL_13B ; break ;
2023-08-24 17:44:11 +02:00
case 48 : model . type = e_model : : MODEL_34B ; break ;
2023-08-23 22:08:04 +02:00
case 60 : model . type = e_model : : MODEL_30B ; break ;
case 80 : model . type = hparams . n_head = = hparams . n_head_kv ? e_model : : MODEL_65B : e_model : : MODEL_70B ; break ;
default : model . type = e_model : : MODEL_UNKNOWN ;
}
} break ;
case LLM_ARCH_FALCON :
{
GGUF_GET_KEY ( ctx , hparams . f_norm_eps , gguf_get_val_f32 , GGUF_TYPE_FLOAT32 , true , kv ( LLM_KV_ATTENTION_LAYERNORM_EPS ) ) ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
switch ( hparams . n_layer ) {
case 32 : model . type = e_model : : MODEL_7B ; break ;
case 60 : model . type = e_model : : MODEL_40B ; break ;
default : model . type = e_model : : MODEL_UNKNOWN ;
}
} break ;
2023-09-14 18:32:10 +02:00
case LLM_ARCH_BAICHUAN :
{
GGUF_GET_KEY ( ctx , hparams . f_norm_rms_eps , gguf_get_val_f32 , GGUF_TYPE_FLOAT32 , true , kv ( LLM_KV_ATTENTION_LAYERNORM_RMS_EPS ) ) ;
switch ( hparams . n_layer ) {
case 32 : model . type = e_model : : MODEL_7B ; break ;
case 40 : model . type = e_model : : MODEL_13B ; break ;
default : model . type = e_model : : MODEL_UNKNOWN ;
}
} break ;
2023-09-15 21:02:13 +02:00
case LLM_ARCH_STARCODER :
{
GGUF_GET_KEY ( ctx , hparams . f_norm_eps , gguf_get_val_f32 , GGUF_TYPE_FLOAT32 , true , kv ( LLM_KV_ATTENTION_LAYERNORM_EPS ) ) ;
switch ( hparams . n_layer ) {
case 24 : model . type = e_model : : MODEL_1B ; break ;
case 36 : model . type = e_model : : MODEL_3B ; break ;
case 42 : model . type = e_model : : MODEL_7B ; break ;
case 40 : model . type = e_model : : MODEL_15B ; break ;
default : model . type = e_model : : MODEL_UNKNOWN ;
}
} break ;
2023-10-04 15:23:39 +02:00
case LLM_ARCH_REFACT :
{
GGUF_GET_KEY ( ctx , hparams . f_norm_rms_eps , gguf_get_val_f32 , GGUF_TYPE_FLOAT32 , true , kv ( LLM_KV_ATTENTION_LAYERNORM_RMS_EPS ) ) ;
switch ( hparams . n_layer ) {
case 32 : model . type = e_model : : MODEL_1B ; break ;
default : model . type = e_model : : MODEL_UNKNOWN ;
}
} break ;
2023-08-23 22:08:04 +02:00
default : ( void ) 0 ;
2023-09-28 23:41:44 +02:00
}
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
model . ftype = ml . ftype ;
}
2023-06-14 19:47:19 +02:00
2023-08-23 22:08:04 +02:00
// TODO: This should probably be in llama.h
2023-08-27 13:19:19 +02:00
// Forward declaration (the definition is not in this section of the file).
// llm_load_vocab below calls it with the string "\u010A" to find the newline
// token of non-SPM vocabularies.
static std : : vector < llama_vocab : : id > llama_tokenize_internal ( const llama_vocab & vocab , std : : string raw_text , bool bos ) ;
// Forward declaration (the definition is not in this section of the file).
// llm_load_vocab below calls it with '\n' to find the newline token of SPM
// vocabularies.
static llama_token llama_byte_to_token ( const llama_vocab & vocab , uint8_t ch ) ;
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
static void llm_load_vocab (
llama_model_loader & ml ,
llama_model & model ) {
auto & vocab = model . vocab ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
struct gguf_context * ctx = ml . ctx_gguf ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
const auto kv = LLM_KV ( model . arch ) ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
const int token_idx = gguf_find_key ( ctx , kv ( LLM_KV_TOKENIZER_LIST ) . c_str ( ) ) ;
if ( token_idx = = - 1 ) {
throw std : : runtime_error ( " cannot find tokenizer vocab in model file \n " ) ;
}
llama : add custom RoPE (#2054)
* Implement customizable RoPE
The original RoPE has pre-defined parameters
theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2]
Our customizable RoPE, ggml_rope_custom_inplace, uses
theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]
with the default matches the original
scale = 1.0
base = 10000
The new command line arguments
--rope-freq-base
--rope-freq-scale
set the two new RoPE parameter.
Recent researches show changing these two parameters extends the context limit with minimal loss.
1. Extending Context to 8K
kaiokendev
https://kaiokendev.github.io/til#extending-context-to-8k
2. Extending Context Window of Large Language Models via Positional Interpolation
Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian
https://arxiv.org/abs/2306.15595
3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation.
https://www.reddit.com/user/bloc97
https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
For the bold, try adding the following command line parameters to your favorite model:
-c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5
* ggml-metal: fix custom rope
* common: fix argument names in help
* llama: increase MEM_REQ_EVAL for MODEL_3B
It avoids crashing for quantized weights on CPU.
Better ways to calculate the required buffer size would be better.
* llama: make MEM_REQ_EVAL depend on n_ctx
* server: use proper Content-Type in curl examples
Without the header Content-Type: application/json, curl will POST with
Content-Type: application/x-www-form-urlencoded
Though our simple server doesn't care, the httplib.h used has a limit
with CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
With Content-Type: application/json, we can send large json data.
* style : minor fixes, mostly indentations
* ggml : fix asserts
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-15 12:34:16 +02:00
2023-09-28 20:30:15 +02:00
const float * scores = nullptr ;
2023-08-23 22:08:04 +02:00
const int score_idx = gguf_find_key ( ctx , kv ( LLM_KV_TOKENIZER_SCORES ) . c_str ( ) ) ;
2023-09-28 20:30:15 +02:00
if ( score_idx ! = - 1 ) {
scores = ( const float * ) gguf_get_arr_data ( ctx , score_idx ) ;
2023-08-23 22:08:04 +02:00
}
2023-08-21 22:07:43 +02:00
2023-09-28 20:30:15 +02:00
const int * toktypes = nullptr ;
2023-08-23 22:08:04 +02:00
const int toktype_idx = gguf_find_key ( ctx , kv ( LLM_KV_TOKENIZER_TOKEN_TYPE ) . c_str ( ) ) ;
2023-09-28 20:30:15 +02:00
if ( toktype_idx ! = - 1 ) {
toktypes = ( const int * ) gguf_get_arr_data ( ctx , toktype_idx ) ;
2023-04-08 22:08:21 +02:00
}
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
// determine vocab type
2023-04-08 22:08:21 +02:00
{
2023-08-23 22:08:04 +02:00
std : : string tokenizer_name ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
GGUF_GET_KEY ( ctx , tokenizer_name , gguf_get_val_str , GGUF_TYPE_STRING , true , kv ( LLM_KV_TOKENIZER_MODEL ) ) ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
if ( tokenizer_name = = " llama " ) {
vocab . type = LLAMA_VOCAB_TYPE_SPM ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
// default special tokens
vocab . special_bos_id = 1 ;
vocab . special_eos_id = 2 ;
vocab . special_unk_id = 0 ;
vocab . special_sep_id = - 1 ;
vocab . special_pad_id = - 1 ;
} else if ( tokenizer_name = = " gpt2 " ) {
vocab . type = LLAMA_VOCAB_TYPE_BPE ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key ( ctx , kv ( LLM_KV_TOKENIZER_MERGES ) . c_str ( ) ) ;
if ( merges_keyidx = = - 1 ) {
throw std : : runtime_error ( " cannot find tokenizer merges in model file \n " ) ;
}
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
const int n_merges = gguf_get_arr_n ( ctx , merges_keyidx ) ;
2023-05-19 21:17:18 +02:00
2023-08-23 22:08:04 +02:00
for ( int i = 0 ; i < n_merges ; i + + ) {
const std : : string word = gguf_get_arr_str ( ctx , merges_keyidx , i ) ;
2023-10-03 09:16:26 +02:00
GGML_ASSERT ( codepoints_from_utf8 ( word ) . size ( ) > 0 ) ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
std : : string first ;
std : : string second ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
const size_t pos = word . find ( ' ' , 1 ) ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
if ( pos ! = std : : string : : npos ) {
first = word . substr ( 0 , pos ) ;
second = word . substr ( pos + 1 ) ;
}
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
vocab . bpe_ranks . emplace ( std : : make_pair ( first , second ) , i ) ;
2023-08-21 22:07:43 +02:00
}
2023-08-23 22:08:04 +02:00
// default special tokens
vocab . special_bos_id = 11 ;
vocab . special_eos_id = 11 ;
vocab . special_unk_id = - 1 ;
vocab . special_sep_id = - 1 ;
vocab . special_pad_id = - 1 ;
} else {
LLAMA_LOG_WARN ( " %s: unknown tokenizer: '%s' " , __func__ , tokenizer_name . c_str ( ) ) ;
LLAMA_LOG_WARN ( " %s: using default tokenizer: 'llama' " , __func__ ) ;
vocab . type = LLAMA_VOCAB_TYPE_SPM ;
2023-05-11 23:23:08 +02:00
}
}
2023-08-23 22:08:04 +02:00
const uint32_t n_vocab = gguf_get_arr_n ( ctx , token_idx ) ;
vocab . id_to_token . resize ( n_vocab ) ;
for ( uint32_t i = 0 ; i < n_vocab ; i + + ) {
std : : string word = gguf_get_arr_str ( ctx , token_idx , i ) ;
2023-10-03 09:16:26 +02:00
GGML_ASSERT ( codepoints_from_utf8 ( word ) . size ( ) > 0 ) ;
2023-08-23 22:08:04 +02:00
vocab . token_to_id [ word ] = i ;
auto & token_data = vocab . id_to_token [ i ] ;
token_data . text = std : : move ( word ) ;
2023-09-28 20:30:15 +02:00
token_data . score = scores ? scores [ i ] : 0.0f ;
token_data . type = toktypes ? ( llama_token_type ) toktypes [ i ] : LLAMA_TOKEN_TYPE_NORMAL ;
2023-03-29 02:03:43 +02:00
}
2023-10-03 09:16:26 +02:00
GGML_ASSERT ( vocab . id_to_token . size ( ) = = vocab . token_to_id . size ( ) ) ;
2023-03-29 02:03:43 +02:00
2023-08-23 22:08:04 +02:00
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2023-08-27 13:19:19 +02:00
if ( vocab . type = = LLAMA_VOCAB_TYPE_SPM ) {
vocab . linefeed_id = llama_byte_to_token ( vocab , ' \n ' ) ;
} else {
2023-10-03 09:16:26 +02:00
vocab . linefeed_id = llama_tokenize_internal ( vocab , " \u010A " , false ) [ 0 ] ;
2023-08-27 13:19:19 +02:00
}
2023-08-23 22:08:04 +02:00
// special tokens
GGUF_GET_KEY ( ctx , vocab . special_bos_id , gguf_get_val_u32 , GGUF_TYPE_UINT32 , false , kv ( LLM_KV_TOKENIZER_BOS_ID ) ) ;
GGUF_GET_KEY ( ctx , vocab . special_eos_id , gguf_get_val_u32 , GGUF_TYPE_UINT32 , false , kv ( LLM_KV_TOKENIZER_EOS_ID ) ) ;
GGUF_GET_KEY ( ctx , vocab . special_unk_id , gguf_get_val_u32 , GGUF_TYPE_UINT32 , false , kv ( LLM_KV_TOKENIZER_UNK_ID ) ) ;
GGUF_GET_KEY ( ctx , vocab . special_sep_id , gguf_get_val_u32 , GGUF_TYPE_UINT32 , false , kv ( LLM_KV_TOKENIZER_SEP_ID ) ) ;
GGUF_GET_KEY ( ctx , vocab . special_pad_id , gguf_get_val_u32 , GGUF_TYPE_UINT32 , false , kv ( LLM_KV_TOKENIZER_PAD_ID ) ) ;
}
// Log a summary of the loaded model metadata: file format, architecture, vocab
// type, hyperparameters, model type/ftype, parameter count, size on disk with
// bits per weight, the general name, and the special / newline tokens.
//
// Token ids equal to -1 are skipped. Any other id that does not index
// vocab.id_to_token is reported with a warning instead of being dereferenced.
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

    // __func__ inside the lambda below would name the lambda, not this function
    const char * fn = __func__;

    // hparams
    LLAMA_LOG_INFO("%s: format = %s\n", fn, llama_file_version_name(ml.fver));
    LLAMA_LOG_INFO("%s: arch = %s\n", fn, LLM_ARCH_NAMES.at(model.arch).c_str());
    LLAMA_LOG_INFO("%s: vocab type = %s\n", fn, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
    LLAMA_LOG_INFO("%s: n_vocab = %u\n", fn, hparams.n_vocab);
    LLAMA_LOG_INFO("%s: n_merges = %u\n", fn, (unsigned int) vocab.bpe_ranks.size());
    LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", fn, hparams.n_ctx_train);
    LLAMA_LOG_INFO("%s: n_embd = %u\n", fn, hparams.n_embd);
    LLAMA_LOG_INFO("%s: n_head = %u\n", fn, hparams.n_head);
    LLAMA_LOG_INFO("%s: n_head_kv = %u\n", fn, hparams.n_head_kv);
    LLAMA_LOG_INFO("%s: n_layer = %u\n", fn, hparams.n_layer);
    LLAMA_LOG_INFO("%s: n_rot = %u\n", fn, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
    LLAMA_LOG_INFO("%s: n_gqa = %u\n", fn, hparams.n_gqa());
    LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", fn, hparams.f_norm_eps);
    LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", fn, hparams.f_norm_rms_eps);
    LLAMA_LOG_INFO("%s: n_ff = %u\n", fn, hparams.n_ff);
    LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", fn, hparams.rope_freq_base_train);
    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", fn, hparams.rope_freq_scale_train);
    LLAMA_LOG_INFO("%s: model type = %s\n", fn, llama_model_type_name(model.type));
    LLAMA_LOG_INFO("%s: model ftype = %s\n", fn, llama_model_ftype_name(model.ftype).c_str());
    LLAMA_LOG_INFO("%s: model params = %.2f B\n", fn, ml.n_elements*1e-9);
    if (ml.n_bytes < GB) {
        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", fn, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    } else {
        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", fn, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    }

    // general kv
    LLAMA_LOG_INFO("%s: general.name = %s\n", fn, model.name.c_str());

    // special tokens: the ids come from the model file, so check them against
    // the token table before looking up the text
    const auto print_token = [&](const char * label, const int id) {
        if (id == -1) {
            return; // token not defined for this vocab
        }
        if (id < 0 || (size_t) id >= vocab.id_to_token.size()) {
            LLAMA_LOG_WARN("%s: %s token = %d is out of range\n", fn, label, id);
            return;
        }
        LLAMA_LOG_INFO("%s: %s token = %d '%s'\n", fn, label, id, vocab.id_to_token[id].text.c_str());
    };

    print_token("BOS", vocab.special_bos_id);
    print_token("EOS", vocab.special_eos_id);
    print_token("UNK", vocab.special_unk_id);
    print_token("SEP", vocab.special_sep_id);
    print_token("PAD", vocab.special_pad_id);
    print_token("LF",  vocab.linefeed_id);
}
static void llm_load_tensors (
llama_model_loader & ml ,
llama_model & model ,
int n_gpu_layers ,
int main_gpu ,
const float * tensor_split ,
bool use_mlock ,
llama_progress_callback progress_callback ,
void * progress_callback_user_data ) {
model . t_start_us = ggml_time_us ( ) ;
auto & ctx = model . ctx ;
auto & hparams = model . hparams ;
model . n_gpu_layers = n_gpu_layers ;
2023-03-22 06:32:36 +01:00
2023-05-13 10:23:15 +02:00
size_t ctx_size ;
size_t mmapped_size ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
ml . calc_sizes ( ctx_size , mmapped_size ) ;
2023-08-21 22:07:43 +02:00
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: ggml ctx size = %7.2f MB \n " , __func__ , ctx_size / 1024.0 / 1024.0 ) ;
2023-03-24 22:17:37 +01:00
2023-03-22 06:32:36 +01:00
// create the ggml context
{
2023-06-24 10:47:58 +02:00
model . buf . resize ( ctx_size ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
if ( use_mlock ) {
2023-08-21 22:07:43 +02:00
model . mlock_buf . init ( model . buf . data ) ;
2023-06-24 10:47:58 +02:00
model . mlock_buf . grow_to ( model . buf . size ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-03-24 22:17:37 +01:00
2023-03-22 06:32:36 +01:00
struct ggml_init_params params = {
2023-06-24 10:47:58 +02:00
/*.mem_size =*/ model . buf . size ,
2023-08-21 22:07:43 +02:00
/*.mem_buffer =*/ model . buf . data ,
2023-08-23 22:08:04 +02:00
/*.no_alloc =*/ ml . use_mmap ,
2023-03-22 06:32:36 +01:00
} ;
model . ctx = ggml_init ( params ) ;
if ( ! model . ctx ) {
2023-06-05 22:24:29 +02:00
throw std : : runtime_error ( format ( " ggml_init() failed " ) ) ;
2023-03-22 06:32:36 +01:00
}
}
2023-06-06 21:41:53 +02:00
( void ) main_gpu ;
2023-09-28 21:42:38 +02:00
# ifdef GGML_USE_CUBLAS
2023-08-25 11:09:42 +02:00
LLAMA_LOG_INFO ( " %s: using " GGML_CUDA_NAME " for GPU acceleration \n " , __func__ ) ;
2023-06-06 21:33:23 +02:00
ggml_cuda_set_main_device ( main_gpu ) ;
2023-06-06 21:41:53 +02:00
# define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
2023-06-06 21:33:23 +02:00
# define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
2023-06-04 08:12:05 +02:00
# elif defined(GGML_USE_CLBLAST)
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: using OpenCL for GPU acceleration \n " , __func__ ) ;
2023-06-06 21:41:53 +02:00
# define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
2023-06-06 21:33:23 +02:00
# define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
# else
2023-06-06 21:41:53 +02:00
# define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
2023-06-06 21:33:23 +02:00
# define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
# endif
2023-03-22 06:32:36 +01:00
// prepare memory for the weights
2023-06-06 21:33:23 +02:00
size_t vram_weights = 0 ;
2023-03-22 06:32:36 +01:00
{
2023-08-23 22:08:04 +02:00
const int64_t n_embd = hparams . n_embd ;
const int64_t n_embd_gqa = hparams . n_embd_gqa ( ) ;
const int64_t n_layer = hparams . n_layer ;
const int64_t n_vocab = hparams . n_vocab ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
const auto tn = LLM_TN ( model . arch ) ;
switch ( model . arch ) {
case LLM_ARCH_LLAMA :
2023-10-04 15:23:39 +02:00
case LLM_ARCH_REFACT :
2023-08-23 22:08:04 +02:00
{
model . tok_embeddings = ml . create_tensor ( ctx , tn ( LLM_TENSOR_TOKEN_EMBD , " weight " ) , { n_embd , n_vocab } , GGML_BACKEND_CPU ) ;
// output
{
ggml_backend backend_norm ;
ggml_backend backend_output ;
if ( n_gpu_layers > int ( n_layer ) ) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
2023-06-14 19:47:19 +02:00
# ifndef _WIN32
2023-09-28 21:42:38 +02:00
backend_norm = LLAMA_BACKEND_OFFLOAD ;
2023-06-14 19:47:19 +02:00
# else
2023-09-28 21:42:38 +02:00
backend_norm = n_gpu_layers < = ( int ) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ;
2023-06-14 19:47:19 +02:00
# endif // _WIN32
2023-08-23 22:08:04 +02:00
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT ;
} else {
backend_norm = GGML_BACKEND_CPU ;
backend_output = GGML_BACKEND_CPU ;
}
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2023-08-23 22:08:04 +02:00
model . output_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT_NORM , " weight " ) , { n_embd } , backend_norm ) ;
model . output = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT , " weight " ) , { n_embd , n_vocab } , backend_output ) ;
if ( backend_norm = = GGML_BACKEND_GPU ) {
vram_weights + = ggml_nbytes ( model . output_norm ) ;
}
if ( backend_output = = GGML_BACKEND_GPU_SPLIT ) {
vram_weights + = ggml_nbytes ( model . output ) ;
}
}
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2023-08-23 22:08:04 +02:00
const uint32_t n_ff = hparams . n_ff ;
2023-08-21 22:07:43 +02:00
2023-08-23 22:08:04 +02:00
const int i_gpu_start = n_layer - n_gpu_layers ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
model . layers . resize ( n_layer ) ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2023-09-14 18:32:10 +02:00
for ( uint32_t i = 0 ; i < n_layer ; + + i ) {
const ggml_backend backend = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ; // NOLINT
const ggml_backend backend_split = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT ; // NOLINT
auto & layer = model . layers [ i ] ;
layer . attn_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM , " weight " , i ) , { n_embd } , backend ) ;
layer . wq = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_Q , " weight " , i ) , { n_embd , n_embd } , backend_split ) ;
layer . wk = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_K , " weight " , i ) , { n_embd , n_embd_gqa } , backend_split ) ;
layer . wv = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_V , " weight " , i ) , { n_embd , n_embd_gqa } , backend_split ) ;
layer . wo = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_OUT , " weight " , i ) , { n_embd , n_embd } , backend_split ) ;
layer . ffn_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_NORM , " weight " , i ) , { n_embd } , backend ) ;
layer . w1 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_GATE , " weight " , i ) , { n_embd , n_ff } , backend_split ) ;
layer . w2 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_DOWN , " weight " , i ) , { n_ff , n_embd } , backend_split ) ;
layer . w3 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_UP , " weight " , i ) , { n_embd , n_ff } , backend_split ) ;
if ( backend = = GGML_BACKEND_GPU ) {
vram_weights + =
ggml_nbytes ( layer . attn_norm ) + ggml_nbytes ( layer . wq ) + ggml_nbytes ( layer . wk ) +
ggml_nbytes ( layer . wv ) + ggml_nbytes ( layer . wo ) + ggml_nbytes ( layer . ffn_norm ) +
ggml_nbytes ( layer . w1 ) + ggml_nbytes ( layer . w2 ) + ggml_nbytes ( layer . w3 ) ;
}
}
} break ;
case LLM_ARCH_BAICHUAN :
{
model . tok_embeddings = ml . create_tensor ( ctx , tn ( LLM_TENSOR_TOKEN_EMBD , " weight " ) , { n_embd , n_vocab } , GGML_BACKEND_CPU ) ;
{
ggml_backend backend_norm ;
ggml_backend backend_output ;
if ( n_gpu_layers > int ( n_layer ) ) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
# ifndef _WIN32
2023-09-28 21:42:38 +02:00
backend_norm = LLAMA_BACKEND_OFFLOAD ;
2023-09-14 18:32:10 +02:00
# else
2023-09-28 21:42:38 +02:00
backend_norm = n_gpu_layers < = ( int ) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ;
2023-09-14 18:32:10 +02:00
# endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT ;
} else {
backend_norm = GGML_BACKEND_CPU ;
backend_output = GGML_BACKEND_CPU ;
}
model . output_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT_NORM , " weight " ) , { n_embd } , backend_norm ) ;
model . output = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT , " weight " ) , { n_embd , n_vocab } , backend_output ) ;
if ( backend_norm = = GGML_BACKEND_GPU ) {
vram_weights + = ggml_nbytes ( model . output_norm ) ;
}
if ( backend_output = = GGML_BACKEND_GPU_SPLIT ) {
vram_weights + = ggml_nbytes ( model . output ) ;
}
}
const uint32_t n_ff = hparams . n_ff ;
const int i_gpu_start = n_layer - n_gpu_layers ;
model . layers . resize ( n_layer ) ;
2023-08-23 22:08:04 +02:00
for ( uint32_t i = 0 ; i < n_layer ; + + i ) {
const ggml_backend backend = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ; // NOLINT
const ggml_backend backend_split = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT ; // NOLINT
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
auto & layer = model . layers [ i ] ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
layer . attn_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM , " weight " , i ) , { n_embd } , backend ) ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
layer . wq = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_Q , " weight " , i ) , { n_embd , n_embd } , backend_split ) ;
layer . wk = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_K , " weight " , i ) , { n_embd , n_embd_gqa } , backend_split ) ;
layer . wv = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_V , " weight " , i ) , { n_embd , n_embd_gqa } , backend_split ) ;
layer . wo = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_OUT , " weight " , i ) , { n_embd , n_embd } , backend_split ) ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2023-08-23 22:08:04 +02:00
layer . ffn_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_NORM , " weight " , i ) , { n_embd } , backend ) ;
layer . w1 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_GATE , " weight " , i ) , { n_embd , n_ff } , backend_split ) ;
layer . w2 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_DOWN , " weight " , i ) , { n_ff , n_embd } , backend_split ) ;
layer . w3 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_UP , " weight " , i ) , { n_embd , n_ff } , backend_split ) ;
if ( backend = = GGML_BACKEND_GPU ) {
vram_weights + =
ggml_nbytes ( layer . attn_norm ) + ggml_nbytes ( layer . wq ) + ggml_nbytes ( layer . wk ) +
ggml_nbytes ( layer . wv ) + ggml_nbytes ( layer . wo ) + ggml_nbytes ( layer . ffn_norm ) +
ggml_nbytes ( layer . w1 ) + ggml_nbytes ( layer . w2 ) + ggml_nbytes ( layer . w3 ) ;
}
}
} break ;
case LLM_ARCH_FALCON :
{
// TODO: token embeddings are kept on the CPU for now (the remaining tensors may be offloaded)
model . tok_embeddings = ml . create_tensor ( ctx , tn ( LLM_TENSOR_TOKEN_EMBD , " weight " ) , { n_embd , n_vocab } , GGML_BACKEND_CPU ) ;
// output
{
ggml_backend backend_norm ;
ggml_backend backend_output ;
if ( n_gpu_layers > int ( n_layer ) ) {
// the norm is not performance-relevant on its own, but keeping it in VRAM reduces data copying;
// on Windows, however, this is detrimental unless everything is on the GPU
# ifndef _WIN32
2023-09-28 21:42:38 +02:00
backend_norm = LLAMA_BACKEND_OFFLOAD ;
2023-08-23 22:08:04 +02:00
# else
2023-09-28 21:42:38 +02:00
backend_norm = n_gpu_layers < = ( int ) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ;
2023-08-23 22:08:04 +02:00
# endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT ;
} else {
backend_norm = GGML_BACKEND_CPU ;
backend_output = GGML_BACKEND_CPU ;
}
model . output_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT_NORM , " weight " ) , { n_embd } , backend_norm ) ;
model . output_norm_b = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT_NORM , " bias " ) , { n_embd } , backend_norm ) ;
model . output = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT , " weight " ) , { n_embd , n_vocab } , backend_output ) ;
2023-08-25 10:55:59 +02:00
if ( backend_norm = = GGML_BACKEND_GPU ) {
vram_weights + = ggml_nbytes ( model . output_norm ) ;
vram_weights + = ggml_nbytes ( model . output_norm_b ) ;
}
if ( backend_output = = GGML_BACKEND_GPU_SPLIT ) {
vram_weights + = ggml_nbytes ( model . output ) ;
}
2023-08-23 22:08:04 +02:00
}
const uint32_t n_ff = hparams . n_ff ;
const int i_gpu_start = n_layer - n_gpu_layers ;
model . layers . resize ( n_layer ) ;
for ( uint32_t i = 0 ; i < n_layer ; + + i ) {
2023-08-25 10:55:59 +02:00
const ggml_backend backend = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ; // NOLINT
2023-08-23 22:08:04 +02:00
const ggml_backend backend_split = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT ; // NOLINT
auto & layer = model . layers [ i ] ;
layer . attn_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM , " weight " , i ) , { n_embd } , backend ) ;
layer . attn_norm_b = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM , " bias " , i ) , { n_embd } , backend ) ;
if ( gguf_find_tensor ( ml . ctx_gguf , tn ( LLM_TENSOR_ATTN_NORM_2 , " weight " , i ) . c_str ( ) ) > = 0 ) {
layer . attn_norm_2 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM_2 , " weight " , i ) , { n_embd } , backend ) ;
layer . attn_norm_2_b = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM_2 , " bias " , i ) , { n_embd } , backend ) ;
2023-08-25 10:55:59 +02:00
if ( backend = = GGML_BACKEND_GPU ) {
vram_weights + = ggml_nbytes ( layer . attn_norm_2 ) ;
vram_weights + = ggml_nbytes ( layer . attn_norm_2_b ) ;
}
2023-08-23 22:08:04 +02:00
}
layer . wqkv = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_QKV , " weight " , i ) , { n_embd , n_embd + 2 * n_embd_gqa } , backend_split ) ;
layer . wo = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_OUT , " weight " , i ) , { n_embd , n_embd } , backend_split ) ;
layer . w2 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_DOWN , " weight " , i ) , { n_ff , n_embd } , backend_split ) ;
layer . w3 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_UP , " weight " , i ) , { n_embd , n_ff } , backend_split ) ;
2023-08-25 10:55:59 +02:00
if ( backend = = GGML_BACKEND_GPU ) {
vram_weights + =
ggml_nbytes ( layer . attn_norm ) + ggml_nbytes ( layer . attn_norm_b ) +
ggml_nbytes ( layer . wqkv ) + ggml_nbytes ( layer . wo ) +
ggml_nbytes ( layer . w2 ) + ggml_nbytes ( layer . w3 ) ;
}
2023-08-23 22:08:04 +02:00
}
} break ;
2023-09-15 21:02:13 +02:00
case LLM_ARCH_STARCODER :
{
model . tok_embeddings = ml . create_tensor ( ctx , tn ( LLM_TENSOR_TOKEN_EMBD , " weight " ) , { n_embd , n_vocab } , GGML_BACKEND_CPU ) ;
model . pos_embeddings = ml . create_tensor ( ctx , tn ( LLM_TENSOR_POS_EMBD , " weight " ) , { n_embd , hparams . n_ctx_train } , GGML_BACKEND_CPU ) ;
// output
{
ggml_backend backend_norm ;
ggml_backend backend_output ;
if ( n_gpu_layers > int ( n_layer ) ) {
// the norm is not performance-relevant on its own, but keeping it in VRAM reduces data copying;
// on Windows, however, this is detrimental unless everything is on the GPU
# ifndef _WIN32
2023-09-28 21:42:38 +02:00
backend_norm = LLAMA_BACKEND_OFFLOAD ;
2023-09-15 21:02:13 +02:00
# else
2023-09-28 21:42:38 +02:00
backend_norm = n_gpu_layers < = ( int ) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ;
2023-09-15 21:02:13 +02:00
# endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT ;
} else {
backend_norm = GGML_BACKEND_CPU ;
backend_output = GGML_BACKEND_CPU ;
}
model . output_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT_NORM , " weight " ) , { n_embd } , backend_norm ) ;
model . output_norm_b = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT_NORM , " bias " ) , { n_embd } , backend_norm ) ;
model . output = ml . create_tensor ( ctx , tn ( LLM_TENSOR_OUTPUT , " weight " ) , { n_embd , n_vocab } , backend_output ) ;
if ( backend_norm = = GGML_BACKEND_GPU ) {
vram_weights + = ggml_nbytes ( model . output_norm ) ;
vram_weights + = ggml_nbytes ( model . output_norm_b ) ;
}
if ( backend_output = = GGML_BACKEND_GPU_SPLIT ) {
vram_weights + = ggml_nbytes ( model . output ) ;
}
}
const uint32_t n_ff = hparams . n_ff ;
const int i_gpu_start = n_layer - n_gpu_layers ;
model . layers . resize ( n_layer ) ;
for ( uint32_t i = 0 ; i < n_layer ; + + i ) {
const ggml_backend backend = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD ; // NOLINT
const ggml_backend backend_split = int ( i ) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT ; // NOLINT
auto & layer = model . layers [ i ] ;
layer . attn_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM , " weight " , i ) , { n_embd } , backend ) ;
layer . attn_norm_b = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_NORM , " bias " , i ) , { n_embd } , backend ) ;
layer . wqkv = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_QKV , " weight " , i ) , { n_embd , n_embd + 2 * n_embd_gqa } , backend_split ) ;
layer . bqkv = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_QKV , " bias " , i ) , { n_embd + 2 * n_embd_gqa } , backend_split ) ;
layer . wo = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_OUT , " weight " , i ) , { n_embd , n_embd } , backend_split ) ;
layer . bo = ml . create_tensor ( ctx , tn ( LLM_TENSOR_ATTN_OUT , " bias " , i ) , { n_embd } , backend_split ) ;
layer . ffn_norm = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_NORM , " weight " , i ) , { n_embd } , backend ) ;
layer . ffn_norm_b = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_NORM , " bias " , i ) , { n_embd } , backend ) ;
layer . w2 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_DOWN , " weight " , i ) , { n_ff , n_embd } , backend_split ) ;
layer . b2 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_DOWN , " bias " , i ) , { n_embd } , backend_split ) ;
layer . w3 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_UP , " weight " , i ) , { n_embd , n_ff } , backend_split ) ;
layer . b3 = ml . create_tensor ( ctx , tn ( LLM_TENSOR_FFN_UP , " bias " , i ) , { n_ff } , backend_split ) ;
if ( backend = = GGML_BACKEND_GPU ) {
vram_weights + =
ggml_nbytes ( layer . attn_norm ) + ggml_nbytes ( layer . attn_norm_b ) +
ggml_nbytes ( layer . wqkv ) + ggml_nbytes ( layer . bqkv ) +
ggml_nbytes ( layer . wo ) + ggml_nbytes ( layer . bo ) +
ggml_nbytes ( layer . ffn_norm ) + ggml_nbytes ( layer . ffn_norm_b ) +
ggml_nbytes ( layer . w2 ) + ggml_nbytes ( layer . b2 ) +
ggml_nbytes ( layer . w3 ) + ggml_nbytes ( layer . b3 ) ;
}
}
} break ;
2023-08-23 22:08:04 +02:00
default :
throw std : : runtime_error ( " unknown architecture " ) ;
2023-09-28 23:41:44 +02:00
}
2023-03-22 06:32:36 +01:00
}
2023-08-23 22:08:04 +02:00
ml . done_getting_tensors ( ) ;
2023-03-22 06:32:36 +01:00
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
// print memory requirements
{
// this is the total host memory required to run inference (weights resident in VRAM are excluded)
2023-07-30 15:58:01 +02:00
size_t mem_required =
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
ctx_size +
2023-07-30 15:58:01 +02:00
mmapped_size - vram_weights ; // weights in VRAM not in memory
2023-09-28 21:42:38 +02:00
LLAMA_LOG_INFO ( " %s: mem required = %7.2f MB \n " , __func__ , mem_required / 1024.0 / 1024.0 ) ;
2023-07-05 08:58:05 +02:00
2023-06-04 08:12:05 +02:00
# if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2023-06-06 21:41:53 +02:00
const int n_gpu = std : : min ( n_gpu_layers , int ( hparams . n_layer ) ) ;
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: offloading %d repeating layers to GPU \n " , __func__ , n_gpu ) ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
if ( n_gpu_layers > ( int ) hparams . n_layer ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: offloading non-repeating layers to GPU \n " , __func__ ) ;
2023-06-14 19:47:19 +02:00
}
2023-07-05 08:58:05 +02:00
# ifdef GGML_USE_CUBLAS
const int max_backend_supported_layers = hparams . n_layer + 3 ;
2023-09-28 21:42:38 +02:00
const int max_offloadable_layers = hparams . n_layer + 3 ;
2023-07-05 08:58:05 +02:00
# elif defined(GGML_USE_CLBLAST)
const int max_backend_supported_layers = hparams . n_layer + 1 ;
const int max_offloadable_layers = hparams . n_layer + 1 ;
# endif // GGML_USE_CUBLAS
2023-09-28 21:42:38 +02:00
LLAMA_LOG_INFO ( " %s: offloaded %d/%d layers to GPU \n " , __func__ , std : : min ( n_gpu_layers , max_offloadable_layers ) , max_backend_supported_layers ) ;
LLAMA_LOG_INFO ( " %s: VRAM used: %.2f MB \n " , __func__ , vram_weights / 1024.0 / 1024.0 ) ;
2023-06-04 08:12:05 +02:00
# else
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
( void ) n_gpu_layers ;
2023-07-05 08:58:05 +02:00
# endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
}
2023-05-13 15:38:36 +02:00
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
// populate `tensors_by_name`
2023-08-23 22:08:04 +02:00
for ( int i = 0 ; i < ml . n_tensors ; + + i ) {
struct ggml_tensor * cur = ggml_get_tensor ( ctx , ml . get_tensor_name ( i ) ) ;
2023-08-21 22:07:43 +02:00
model . tensors_by_name . emplace_back ( ggml_get_name ( cur ) , cur ) ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
}
2023-05-13 15:38:36 +02:00
train : improved training-from-scratch example (#1652)
* add python wrapper
https://gist.github.com/abetlen/2b90e5f153f6efd00931d098de5c73ce
* fix decoding error. adds errors=ignore parameter
* add python bindings for functions to get and set the whole llama state
(rng, logits, embedding and kv_cache)
* update python bindings
* add text generating baby-llama from scratch example
* fix race condition bug in ggml_compute_forward_diag_mask_f32
* implement ggml_soft_max_back for more performant backward pass of soft_max
avoids creating big intermediate matrices of size n_embd x n_embd for llama layers and n_vocab x n_vocab for cross entropy loss
* improve softmax backward pass
go from quadratic runtime to linear runtime by simplifying the formulas
* fix race condition bug in non-inplace ggml_compute_forward_diag_mask_f32
memcpy needs to be synchronized across threads to avoid race conditions.
=> do it in INIT phase
* fix bug in ggml_compute_forward_soft_max_back_f32 on DEBUG build
* improve performance of mul_mat backward pass
avoid transpose by using mul_mat with swapped arguments
* avoid printing too many newlines in baby-llama-text
* activate threading in baby-llama-text
* add ggml_out_prod and use it for mul_mat backward pass for improved performance
performance stats report improvement from 37 seconds to 16 seconds runtime during my training tests
* better weight initialization improves training convergence at start
* better weight initialization improves training convergence at start
* improve ggml_out_prod performance
- change iteration order (>15s -> 10s runtime)
- parallelize over one more dimension: over dst matrix rows (10s -> <5s runtime)
* add llama sampler, shuffle samples and constrain sampling to tokens occurring in train data
* fix get_samples call, add model tensor names, increase model size, start training samples after newline
* save trained model to checkpoint and load model to be trained from checkpoint
* use inplace functions where possible
* initialize rng with srand
* use different arguments for input and output checkpoint
* ggml fixes to support backward pass on inplace operations
* remove duplicate include
* fix cross entropy loss
- add target probabilities for each sample which is then used in cross entropy loss
* print used memory before and after optimization
* sample with non-greedy sampling parameters at the end of training
* add cmake target for baby-llama-text
* add ggml_add1_inplace to header
* enable gradient propagation for inplace add1 and scale operations
those functions' backward passes don't need the original src0, so they also work when forward is inplace
* implement AdamW in ggml_opt_adam by adding weight decay parameter (default 0.001f)
also add a schedule parameter (default 1.0f) that can be used to scale alpha and decay according to learning schedule.
setting the decay parameter to zero disables AdamW resulting in normal Adam optimizer.
since the difference between Adam and AdamW is minimal it is not implemented as another optimizer, but integrated into the existing Adam optimizer.
* use inplace operations in cross_entropy_loss
* fix random weight initialization scale
* add missing default parameters for adam optimizer
* add ggml_opt_context, so that we can properly resume training
otherwise the optimizer states, tracking statistics about the error function and its derivatives,
will reset to zero each time ggml_opt is called, hindering convergence on resumed training.
now the optimizer context and all its memory is stored in a separate struct.
* fix bug in llama_sample_token_mirostat_v2
when all candidates are filtered out through mu threshold, the following soft_max operation will fail.
so keep at least one.
* add forward function without using cache, for more performant training
during training on whole samples no cache is required.
removing the cache and simplifying the remaining code results in performance and memory usage improvement.
* print suppressed newline tokens as string "\n"
printing too many actual newlines is suppressed to avoid flooding the console.
* store optimizer state in training checkpoint and add learning schedule
persistent optimizer state allows resuming training without resetting the optimizer
learning schedule consists of linear warmup ramp followed by cosine decay with restarts
* remove unused functions
* fix bug in get_samples which corrupted training targets
* save checkpoint only when it was trained
* simplify code
* remove trailing whitespace
* simplify backward pass for SQRT
* replace inefficient repeat backward pass with dedicated repeat_back operation
* add ggml_cross_entropy_loss with backward pass for faster training
cross entropy loss can also be implemented using softmax and log, but as dedicated operation it is faster and especially avoids unnecessary memory overhead.
* add tests for cross_entropy_loss backward pass
finite differences regularly result in an estimated gradient of zero, despite the backward pass giving a non-zero gradient.
_probably_ the finite differences fail due to numerical issues
* use ggml_cross_entropy_loss in text training example
* remove trailing whitespace
* slightly improve how cross entropy loss is computed
btw: directly implemented cross entropy loss seems to have way lower magnitudes than when implemented with softmax and log.
probably the input to log gets closer to zero due to float numerics.
maybe the multiplication by (1.0-eps)/sum is more accurate..
* add llama_get_vocab to get the vocabulary as output parameters
* set default model.type for unknown models with few layers
* add export of training checkpoint to llama compatible model file
* get vocabulary for exporting training checkpoint to llama compatible model file
* implement backward pass of flash attention
* bugfixes for backward pass of flash attention
* test flash attention backward pass
need to set loose error bounds to pass.
the finite differences are close to numeric limits and often return quite different values than the backward pass.
reducing eps further lets the gradients vanish completely.
likewise setting eps too big results in less accurate values.
the softmax in the middle of the function is probably the most responsible for the numeric issues using finite differences.
* add option to train with flash attention and move options to the top of the main function
training from scratch also works with flash attention
training convergence and generation results after a fixed number of iterations are worse than when not using flash attention.
maybe there still lingers a bug in the flash attention backward pass?
but training works, just with slower convergence.
flash attention is still worth using, because it requires way less memory and is faster with high n_ctx
* add train_params and command line option parser
* remove unnecessary comments
* add train params to specify memory size
* remove python bindings
* rename baby-llama-text to train-text-from-scratch
* replace auto parameters in lambda function
* add #include <climits>
* add explicit cast to fix compile error
"error: non-constant-expression cannot be narrowed from type 'int64_t' (aka 'long long') to 'uint32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing]"
* remove trailing whitespace
* add ggml_opt_resume_g which accepts forward and backward cgraphs
* fix formulas in comments
* bug fix for ggml_compute_forward_get_rows_back_f32
the result should be set to zero, not to whatever data is in opt0
* improve training memory usage with scratch buffers
instead of relying on the automatic backward pass, we manually create the graph for the backward pass.
it turns out that all backward pass operations need only temporary memory which can be reused after each layer.
will compute backward pass for ALL model parameters
* add option to use scratch buffers in training or not
make it configurable because currently training with scratch buffers implies flash attention and optimization over all parameters.
* ci : disable temporary
* store view offset and permute axes in opt[0] instead of storing it in padding
use memcpy to store offset, because offset is of type size_t.
when storing it as int32_t offset would have to be smaller than 2^31 which is not necessarily true.
* minor : fix compile warnings + minor style changes
* fix bug in threaded indices calculation of ggml_compute_forward_flash_attn_back_f32
* store view offset like in master branch
* bug fix in forward_batch_wo_cache_flash_attn_train
* scratch buffer bug fixes in forward_batch_wo_cache_flash_attn_train
data of permute and reshape is the same as their input.
if we want to preserve the output of permute/reshape, we also need to preserve their inputs.
replace reshape(src0, src1) with reshape_nd calls so that we don't need src1.
replace (temporary) t03 with ggml_repeat(ctx0, layer.attention_norm, t02).
in the future we could also use the new broadcasting ggml_mul to avoid these repeat calls.
for this we need backward pass of broadcasting ggml_mul.
* remove unnecessary scratch buffer 0
buf 0 is persistent memory, so we can just disable scratch for this by using buf -1
* avoid creating unnecessary grad tensors
previously we needed to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads
this wasted memory, because an unnecessary grad was automatically created for each op:
the automatically generated grad was unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ).
this discarded the automatically generated grad resulting in wasted memory.
improved this by changing expand(..) to not use ggml_build_forward_expand.
expand set cgraph->nodes but not the leafs.
cgraph->leafs & cgraph->grads are set in another pass after the last expand call.
* print used training seed
* zero initialize gfbuf and gbbuf
* ci : re-enable workflows + add README for training
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-13 21:04:40 +02:00
( void ) tensor_split ;
2023-09-28 21:42:38 +02:00
# ifdef GGML_USE_CUBLAS
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
{
2023-06-06 21:33:23 +02:00
ggml_cuda_set_tensor_split ( tensor_split ) ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
}
2023-05-22 23:33:24 +02:00
# endif
2023-05-13 15:38:36 +02:00
2023-08-23 22:08:04 +02:00
ml . load_all_data ( ctx , progress_callback , progress_callback_user_data , use_mlock ? & model . mlock_mmap : NULL ) ;
2023-06-12 14:44:16 +02:00
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
if ( progress_callback ) {
progress_callback ( 1.0f , progress_callback_user_data ) ;
2023-05-13 15:38:36 +02:00
}
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2023-08-23 22:08:04 +02:00
model . mapping = std : : move ( ml . mapping ) ;
2023-03-22 06:32:36 +01:00
2023-03-29 22:51:37 +02:00
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
2023-06-24 10:47:58 +02:00
model . t_load_us = ggml_time_us ( ) - model . t_start_us ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-03-22 06:32:36 +01:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
// Load a model from the file `fname` into `model`.
//
// Steps, in order:
//   1. construct a llama_model_loader for `fname` (honoring `use_mmap`)
//   2. record `vocab_only` in model.hparams
//   3. run llm_load_arch, llm_load_hparams, llm_load_vocab and llm_load_print_meta
//   4. verify that hparams.n_vocab equals the number of entries in vocab.id_to_token,
//      throwing std::runtime_error on mismatch
//   5. if `vocab_only` is set, log a notice and return true without loading tensors
//   6. otherwise call llm_load_tensors, forwarding `n_gpu_layers`, `main_gpu`,
//      `tensor_split`, `use_mlock`, `progress_callback` and `progress_callback_user_data`
//
// Returns true on success. Any std::exception thrown by the steps above is caught here,
// logged via LLAMA_LOG_ERROR ("error loading model: ...") and reported as a `false`
// return value. Exceptions that do not derive from std::exception are not caught
// by this function.
static bool llama_model_load (
const std : : string & fname ,
2023-06-24 10:47:58 +02:00
llama_model & model ,
2023-05-13 15:38:36 +02:00
int n_gpu_layers ,
2023-06-06 21:33:23 +02:00
int main_gpu ,
2023-07-21 12:10:51 +02:00
const float * tensor_split ,
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
bool use_mmap ,
bool use_mlock ,
bool vocab_only ,
llama_progress_callback progress_callback ,
void * progress_callback_user_data ) {
try {
2023-09-28 21:42:38 +02:00
llama_model_loader ml ( fname , use_mmap ) ;
2023-08-23 22:08:04 +02:00
2023-09-28 21:42:38 +02:00
model . hparams . vocab_only = vocab_only ;
2023-08-23 22:08:04 +02:00
2023-09-28 21:42:38 +02:00
llm_load_arch ( ml , model ) ;
llm_load_hparams ( ml , model ) ;
llm_load_vocab ( ml , model ) ;
llm_load_print_meta ( ml , model ) ;
2023-08-23 22:08:04 +02:00
if ( model . hparams . n_vocab ! = model . vocab . id_to_token . size ( ) ) {
throw std : : runtime_error ( " vocab size mismatch " ) ;
}
if ( vocab_only ) {
LLAMA_LOG_INFO ( " %s: vocab only - skipping tensors \n " , __func__ ) ;
return true ;
}
llm_load_tensors (
2023-09-28 21:42:38 +02:00
ml , model , n_gpu_layers ,
main_gpu , tensor_split ,
2023-08-23 22:08:04 +02:00
use_mlock , progress_callback , progress_callback_user_data ) ;
2023-06-05 22:24:29 +02:00
} catch ( const std : : exception & err ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " error loading model: %s \n " , err . what ( ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
return false ;
2023-03-25 06:26:28 +01:00
}
2023-08-23 22:08:04 +02:00
return true ;
2023-03-22 06:32:36 +01:00
}
2023-08-23 22:08:04 +02:00
static struct ggml_cgraph * llm_build_llama (
2023-06-28 17:53:37 +02:00
llama_context & lctx ,
2023-09-28 18:04:36 +02:00
const llama_batch & batch ) {
2023-03-22 06:32:36 +01:00
const auto & model = lctx . model ;
const auto & hparams = model . hparams ;
2023-09-28 21:42:38 +02:00
const auto & cparams = lctx . cparams ;
2023-03-22 06:32:36 +01:00
2023-06-24 10:47:58 +02:00
const auto & kv_self = lctx . kv_self ;
2023-03-24 22:17:37 +01:00
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ! ! kv_self . ctx ) ;
2023-03-24 22:17:37 +01:00
2023-07-23 14:09:47 +02:00
const int64_t n_embd = hparams . n_embd ;
const int64_t n_layer = hparams . n_layer ;
2023-09-28 21:42:38 +02:00
const int64_t n_ctx = cparams . n_ctx ;
2023-07-23 14:09:47 +02:00
const int64_t n_head = hparams . n_head ;
const int64_t n_head_kv = hparams . n_head_kv ;
const int64_t n_embd_head = hparams . n_embd_head ( ) ;
const int64_t n_embd_gqa = hparams . n_embd_gqa ( ) ;
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2023-03-22 06:32:36 +01:00
2023-09-28 21:42:38 +02:00
const float freq_base = cparams . rope_freq_base ;
const float freq_scale = cparams . rope_freq_scale ;
2023-08-21 22:07:43 +02:00
const float norm_rms_eps = hparams . f_norm_rms_eps ;
llama : add custom RoPE (#2054)
* Implement customizable RoPE
The original RoPE has pre-defined parameters
theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2]
Our customizable RoPE, ggml_rope_custom_inplace, uses
theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]
with the default matches the original
scale = 1.0
base = 10000
The new command line arguments
--rope-freq-base
--rope-freq-scale
set the two new RoPE parameter.
Recent researches show changing these two parameters extends the context limit with minimal loss.
1. Extending Context to 8K
kaiokendev
https://kaiokendev.github.io/til#extending-context-to-8k
2. Extending Context Window of Large Language Models via Positional Interpolation
Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian
https://arxiv.org/abs/2306.15595
3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation.
https://www.reddit.com/user/bloc97
https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
For the bold, try adding the following command line parameters to your favorite model:
-c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5
* ggml-metal: fix custom rope
* common: fix argument names in help
* llama: increase MEM_REQ_EVAL for MODEL_3B
It avoids crashing for quantized weights on CPU.
Better ways to calculate the required buffer size would be better.
* llama: make MEM_REQ_EVAL depend on n_ctx
* server: use proper Content-Type in curl examples
Without the header Content-Type: application/json, curl will POST with
Content-Type: application/x-www-form-urlencoded
Though our simple server doesn't care, the httplib.h used has a limit
with CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
With Content-Type: application/json, we can send large json data.
* style : minor fixes, mostly indentations
* ggml : fix asserts
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-15 12:34:16 +02:00
2023-07-23 14:09:47 +02:00
const int n_gpu_layers = model . n_gpu_layers ;
2023-09-28 18:04:36 +02:00
const int32_t n_tokens = batch . n_tokens ;
const int32_t n_kv = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx : kv_self . n ;
const int32_t kv_head = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx - n_tokens : kv_self . head ;
const bool do_rope_shift = ggml_allocr_is_measure ( lctx . alloc ) | | kv_self . has_shift ;
//printf("n_kv = %d\n", n_kv);
2023-08-23 22:08:04 +02:00
auto & buf_compute = lctx . buf_compute ;
2023-03-22 06:32:36 +01:00
struct ggml_init_params params = {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
/*.mem_size =*/ buf_compute . size ,
2023-08-21 22:07:43 +02:00
/*.mem_buffer =*/ buf_compute . data ,
2023-03-29 02:03:43 +02:00
/*.no_alloc =*/ false ,
2023-03-22 06:32:36 +01:00
} ;
2023-07-30 15:58:01 +02:00
params . no_alloc = true ;
2023-03-22 06:32:36 +01:00
struct ggml_context * ctx0 = ggml_init ( params ) ;
2023-03-25 16:03:10 +01:00
2023-07-26 15:56:53 +02:00
ggml_cgraph * gf = ggml_new_graph ( ctx0 ) ;
2023-07-07 18:24:01 +02:00
2023-06-04 22:34:30 +02:00
struct ggml_tensor * cur ;
2023-06-28 17:53:37 +02:00
struct ggml_tensor * inpL ;
2023-09-28 18:04:36 +02:00
if ( batch . token ) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
2023-07-30 15:58:01 +02:00
ggml_allocr_alloc ( lctx . alloc , inp_tokens ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( inp_tokens - > data , batch . token , n_tokens * ggml_element_size ( inp_tokens ) ) ;
2023-07-30 15:58:01 +02:00
}
2023-07-10 17:49:56 +02:00
ggml_set_name ( inp_tokens , " inp_tokens " ) ;
inpL = ggml_get_rows ( ctx0 , model . tok_embeddings , inp_tokens ) ;
2023-06-28 17:53:37 +02:00
} else {
2023-07-10 17:49:56 +02:00
# ifdef GGML_USE_MPI
GGML_ASSERT ( false & & " not implemented " ) ;
# endif
2023-09-28 18:04:36 +02:00
inpL = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_embd , n_tokens ) ;
2023-07-30 15:58:01 +02:00
ggml_allocr_alloc ( lctx . alloc , inpL ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( inpL - > data , batch . embd , n_tokens * n_embd * ggml_element_size ( inpL ) ) ;
2023-07-30 15:58:01 +02:00
}
2023-06-28 17:53:37 +02:00
}
2023-03-22 06:32:36 +01:00
2023-06-06 21:33:23 +02:00
const int i_gpu_start = n_layer - n_gpu_layers ;
2023-06-06 21:41:53 +02:00
( void ) i_gpu_start ;
2023-06-06 21:33:23 +02:00
2023-06-14 19:47:19 +02:00
// offload functions set the tensor output backend to GPU
// tensors are GPU-accelerated if any input or the output has been offloaded
offload_func_t offload_func_nr = llama_nop ; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop ;
offload_func_t offload_func_v = llama_nop ;
# ifdef GGML_USE_CUBLAS
2023-07-10 17:49:56 +02:00
if ( n_gpu_layers > n_layer ) {
2023-08-22 15:25:19 +02:00
offload_func_nr = ggml_cuda_assign_buffers_no_alloc ;
2023-07-10 17:49:56 +02:00
}
if ( n_gpu_layers > n_layer + 1 ) {
2023-08-22 15:25:19 +02:00
offload_func_v = ggml_cuda_assign_buffers_no_alloc ;
2023-07-10 17:49:56 +02:00
}
if ( n_gpu_layers > n_layer + 2 ) {
2023-08-22 15:25:19 +02:00
offload_func_kq = ggml_cuda_assign_buffers_no_alloc ;
2023-07-10 17:49:56 +02:00
}
2023-06-14 19:47:19 +02:00
# endif // GGML_USE_CUBLAS
2023-09-28 18:04:36 +02:00
// KQ_scale
2023-07-30 15:58:01 +02:00
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_F32 , 1 ) ;
2023-09-28 18:04:36 +02:00
ggml_set_name ( KQ_scale , " 1/sqrt(n_embd_head) " ) ;
2023-07-30 15:58:01 +02:00
ggml_allocr_alloc ( lctx . alloc , KQ_scale ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
ggml_set_f32 ( KQ_scale , 1.0f / sqrtf ( float ( n_embd_head ) ) ) ;
}
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_kv , n_tokens , 1 ) ;
offload_func_kq ( KQ_mask ) ;
ggml_set_name ( KQ_mask , " KQ_mask " ) ;
ggml_allocr_alloc ( lctx . alloc , KQ_mask ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
float * data = ( float * ) KQ_mask - > data ;
memset ( data , 0 , ggml_nbytes ( KQ_mask ) ) ;
for ( int h = 0 ; h < 1 ; + + h ) {
for ( int j = 0 ; j < n_tokens ; + + j ) {
const llama_pos pos = batch . pos [ j ] ;
const llama_seq_id seq_id = batch . seq_id [ j ] ;
for ( int i = 0 ; i < n_kv ; + + i ) {
if ( ! kv_self . cells [ i ] . has_seq_id ( seq_id ) | | kv_self . cells [ i ] . pos > pos ) {
data [ h * ( n_kv * n_tokens ) + j * n_kv + i ] = - INFINITY ;
}
}
}
}
}
// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
offload_func_kq ( KQ_pos ) ;
ggml_set_name ( KQ_pos , " KQ_pos " ) ;
ggml_allocr_alloc ( lctx . alloc , KQ_pos ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
int * data = ( int * ) KQ_pos - > data ;
for ( int i = 0 ; i < n_tokens ; + + i ) {
data [ i ] = batch . pos [ i ] ;
}
}
// shift the entire K-cache if needed
if ( do_rope_shift ) {
struct ggml_tensor * K_shift = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_ctx ) ;
offload_func_kq ( K_shift ) ;
ggml_set_name ( K_shift , " K_shift " ) ;
ggml_allocr_alloc ( lctx . alloc , K_shift ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
int * data = ( int * ) K_shift - > data ;
for ( int i = 0 ; i < n_ctx ; + + i ) {
data [ i ] = kv_self . cells [ i ] . delta ;
}
}
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * tmp =
ggml_rope_custom_inplace ( ctx0 ,
ggml_view_3d ( ctx0 , kv_self . k ,
n_embd_head , n_head_kv , n_ctx ,
ggml_element_size ( kv_self . k ) * n_embd_head ,
ggml_element_size ( kv_self . k ) * n_embd_gqa ,
ggml_element_size ( kv_self . k ) * n_embd_gqa * n_ctx * il ) ,
K_shift , n_embd_head , 0 , 0 , freq_base , freq_scale ) ;
offload_func_kq ( tmp ) ;
ggml_build_forward_expand ( gf , tmp ) ;
}
2023-07-30 15:58:01 +02:00
}
2023-03-22 06:32:36 +01:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2023-07-10 17:49:56 +02:00
ggml_format_name ( inpL , " layer_inp_%d " , il ) ;
2023-06-06 21:33:23 +02:00
offload_func_t offload_func = llama_nop ;
# ifdef GGML_USE_CUBLAS
if ( il > = i_gpu_start ) {
2023-08-22 15:25:19 +02:00
offload_func = ggml_cuda_assign_buffers_no_alloc ;
2023-06-06 21:33:23 +02:00
}
# endif // GGML_USE_CUBLAS
2023-03-22 06:32:36 +01:00
struct ggml_tensor * inpSA = inpL ;
// norm
{
2023-08-21 22:07:43 +02:00
cur = ggml_rms_norm ( ctx0 , inpL , norm_rms_eps ) ;
2023-06-06 21:33:23 +02:00
offload_func ( cur ) ;
ggml_set_name ( cur , " rms_norm_0 " ) ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
// cur = cur*attn_norm(broadcasted)
cur = ggml_mul ( ctx0 , cur , model . layers [ il ] . attn_norm ) ;
2023-06-06 21:33:23 +02:00
offload_func ( cur ) ;
ggml_set_name ( cur , " attention_norm_0 " ) ;
2023-03-22 06:32:36 +01:00
}
// self-attention
{
2023-04-05 21:07:33 +02:00
// compute Q and K and RoPE them
2023-06-06 21:33:23 +02:00
struct ggml_tensor * tmpk = ggml_mul_mat ( ctx0 , model . layers [ il ] . wk , cur ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( tmpk ) ;
2023-06-06 21:33:23 +02:00
ggml_set_name ( tmpk , " tmpk " ) ;
2023-06-14 19:47:19 +02:00
struct ggml_tensor * tmpq = ggml_mul_mat ( ctx0 , model . layers [ il ] . wq , cur ) ;
offload_func_kq ( tmpq ) ;
ggml_set_name ( tmpq , " tmpq " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Kcur = ggml_rope_custom ( ctx0 , ggml_reshape_3d ( ctx0 , tmpk , n_embd_head , n_head_kv , n_tokens ) , KQ_pos , n_embd_head , 0 , 0 , freq_base , freq_scale ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( Kcur ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( Kcur , " Kcur " ) ;
2023-03-22 06:32:36 +01:00
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Qcur = ggml_rope_custom ( ctx0 , ggml_reshape_3d ( ctx0 , tmpq , n_embd_head , n_head , n_tokens ) , KQ_pos , n_embd_head , 0 , 0 , freq_base , freq_scale ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( Qcur ) ;
2023-06-06 21:33:23 +02:00
ggml_set_name ( Qcur , " Qcur " ) ;
2023-03-22 06:32:36 +01:00
// store key and value to memory
2023-04-05 21:07:33 +02:00
{
2023-09-28 18:04:36 +02:00
// compute the transposed [n_tokens, n_embd] V matrix
2023-06-14 19:47:19 +02:00
struct ggml_tensor * tmpv = ggml_mul_mat ( ctx0 , model . layers [ il ] . wv , cur ) ;
offload_func_v ( tmpv ) ;
ggml_set_name ( tmpv , " tmpv " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Vcur = ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , tmpv , n_embd_gqa , n_tokens ) ) ;
2023-06-14 19:47:19 +02:00
offload_func_v ( Vcur ) ;
2023-06-04 22:34:30 +02:00
ggml_set_name ( Vcur , " Vcur " ) ;
2023-04-05 21:07:33 +02:00
2023-09-28 18:04:36 +02:00
struct ggml_tensor * k = ggml_view_1d ( ctx0 , kv_self . k , n_tokens * n_embd_gqa , ( ggml_element_size ( kv_self . k ) * n_embd_gqa ) * ( il * n_ctx + kv_head ) ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( k ) ;
2023-06-06 21:33:23 +02:00
ggml_set_name ( k , " k " ) ;
2023-06-14 19:47:19 +02:00
2023-09-28 18:04:36 +02:00
struct ggml_tensor * v = ggml_view_2d ( ctx0 , kv_self . v , n_tokens , n_embd_gqa ,
2023-04-05 21:07:33 +02:00
( n_ctx ) * ggml_element_size ( kv_self . v ) ,
2023-09-28 18:04:36 +02:00
( il * n_ctx ) * ggml_element_size ( kv_self . v ) * n_embd_gqa + kv_head * ggml_element_size ( kv_self . v ) ) ;
2023-06-14 19:47:19 +02:00
offload_func_v ( v ) ;
2023-08-23 22:08:04 +02:00
ggml_set_name ( v , " v " ) ;
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Kcur , k ) ) ;
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Vcur , v ) ) ;
}
struct ggml_tensor * Q = ggml_permute ( ctx0 , Qcur , 0 , 2 , 1 , 3 ) ;
offload_func_kq ( Q ) ;
ggml_set_name ( Q , " Q " ) ;
struct ggml_tensor * K =
ggml_view_3d ( ctx0 , kv_self . k ,
2023-09-28 18:04:36 +02:00
n_embd_head , n_kv , n_head_kv ,
2023-08-23 22:08:04 +02:00
ggml_element_size ( kv_self . k ) * n_embd_gqa ,
ggml_element_size ( kv_self . k ) * n_embd_head ,
ggml_element_size ( kv_self . k ) * n_embd_gqa * n_ctx * il ) ;
offload_func_kq ( K ) ;
ggml_set_name ( K , " K " ) ;
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
offload_func_kq ( KQ ) ;
ggml_set_name ( KQ , " KQ " ) ;
// KQ_scaled = KQ / sqrt(n_embd_head)
2023-09-28 18:04:36 +02:00
// KQ_scaled shape [n_kv, n_tokens, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale ( ctx0 , KQ , KQ_scale ) ;
2023-08-23 22:08:04 +02:00
offload_func_kq ( KQ_scaled ) ;
ggml_set_name ( KQ_scaled , " KQ_scaled " ) ;
// KQ_masked = mask_past(KQ_scaled)
2023-09-28 18:04:36 +02:00
struct ggml_tensor * KQ_masked = ggml_add ( ctx0 , KQ_scaled , KQ_mask ) ;
2023-08-23 22:08:04 +02:00
offload_func_kq ( KQ_masked ) ;
ggml_set_name ( KQ_masked , " KQ_masked " ) ;
// KQ = soft_max(KQ_masked)
2023-09-28 18:04:36 +02:00
struct ggml_tensor * KQ_soft_max = ggml_soft_max ( ctx0 , KQ_masked ) ;
2023-08-23 22:08:04 +02:00
offload_func_v ( KQ_soft_max ) ;
ggml_set_name ( KQ_soft_max , " KQ_soft_max " ) ;
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d ( ctx0 , kv_self . v ,
2023-09-28 18:04:36 +02:00
n_kv , n_embd_head , n_head_kv ,
2023-08-23 22:08:04 +02:00
ggml_element_size ( kv_self . v ) * n_ctx ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_head ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_gqa * il ) ;
offload_func_v ( V ) ;
ggml_set_name ( V , " V " ) ;
# if 1
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ_soft_max ) ;
offload_func_v ( KQV ) ;
ggml_set_name ( KQV , " KQV " ) ;
# else
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
// is there a better way?
2023-09-28 18:04:36 +02:00
struct ggml_tensor * V_cont = ggml_cpy ( ctx0 , V , ggml_new_tensor_3d ( ctx0 , kv_self . v - > type , n_ctx , n_embd_head , n_head ) ) ;
2023-08-23 22:08:04 +02:00
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V_cont , KQ_soft_max ) ;
# endif
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
offload_func_v ( KQV_merged ) ;
ggml_set_name ( KQV_merged , " KQV_merged " ) ;
2023-09-28 18:04:36 +02:00
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
cur = ggml_cont_2d ( ctx0 , KQV_merged , n_embd , n_tokens ) ;
2023-08-23 22:08:04 +02:00
offload_func_v ( cur ) ;
ggml_set_name ( cur , " KQV_merged_contiguous " ) ;
// projection (no bias)
cur = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . wo ,
cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " result_wo " ) ;
}
struct ggml_tensor * inpFF = ggml_add ( ctx0 , cur , inpSA ) ;
offload_func ( inpFF ) ;
ggml_set_name ( inpFF , " inpFF " ) ;
// feed-forward network
{
// norm
{
cur = ggml_rms_norm ( ctx0 , inpFF , norm_rms_eps ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " rms_norm_1 " ) ;
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul ( ctx0 , cur , model . layers [ il ] . ffn_norm ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " ffn_norm " ) ;
}
struct ggml_tensor * tmp = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . w3 ,
cur ) ;
offload_func ( tmp ) ;
ggml_set_name ( tmp , " result_w3 " ) ;
cur = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . w1 ,
cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " result_w1 " ) ;
// SILU activation
cur = ggml_silu ( ctx0 , cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " silu " ) ;
cur = ggml_mul ( ctx0 , cur , tmp ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " silu_x_result_w3 " ) ;
cur = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . w2 ,
cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " result_w2 " ) ;
}
cur = ggml_add ( ctx0 , cur , inpFF ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " inpFF_+_result_w2 " ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
// norm
{
cur = ggml_rms_norm ( ctx0 , cur , norm_rms_eps ) ;
offload_func_nr ( cur ) ;
ggml_set_name ( cur , " rms_norm_2 " ) ;
// cur = cur*norm(broadcasted)
cur = ggml_mul ( ctx0 , cur , model . output_norm ) ;
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
ggml_set_name ( cur , " result_norm " ) ;
}
// lm_head
cur = ggml_mul_mat ( ctx0 , model . output , cur ) ;
ggml_set_name ( cur , " result_output " ) ;
ggml_build_forward_expand ( gf , cur ) ;
ggml_free ( ctx0 ) ;
return gf ;
}
2023-09-14 18:32:10 +02:00
static struct ggml_cgraph * llm_build_baichaun (
llama_context & lctx ,
2023-09-28 18:04:36 +02:00
const llama_batch & batch ) {
2023-09-14 18:32:10 +02:00
const auto & model = lctx . model ;
const auto & hparams = model . hparams ;
2023-09-28 21:42:38 +02:00
const auto & cparams = lctx . cparams ;
2023-09-14 18:32:10 +02:00
const auto & kv_self = lctx . kv_self ;
GGML_ASSERT ( ! ! kv_self . ctx ) ;
const int64_t n_embd = hparams . n_embd ;
const int64_t n_layer = hparams . n_layer ;
2023-09-28 21:42:38 +02:00
const int64_t n_ctx = cparams . n_ctx ;
2023-09-14 18:32:10 +02:00
const int64_t n_head = hparams . n_head ;
const int64_t n_head_kv = hparams . n_head_kv ;
const int64_t n_embd_head = hparams . n_embd_head ( ) ;
const int64_t n_embd_gqa = hparams . n_embd_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2023-09-28 21:42:38 +02:00
const float freq_base = cparams . rope_freq_base ;
const float freq_scale = cparams . rope_freq_scale ;
2023-09-14 18:32:10 +02:00
const float norm_rms_eps = hparams . f_norm_rms_eps ;
const int n_gpu_layers = model . n_gpu_layers ;
2023-09-28 18:04:36 +02:00
const int32_t n_tokens = batch . n_tokens ;
const int32_t n_kv = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx : kv_self . n ;
const int32_t kv_head = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx - n_tokens : kv_self . head ;
const bool do_rope_shift = ggml_allocr_is_measure ( lctx . alloc ) | | kv_self . has_shift ;
2023-09-14 18:32:10 +02:00
auto & buf_compute = lctx . buf_compute ;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute . size ,
/*.mem_buffer =*/ buf_compute . data ,
/*.no_alloc =*/ false ,
} ;
params . no_alloc = true ;
struct ggml_context * ctx0 = ggml_init ( params ) ;
ggml_cgraph * gf = ggml_new_graph ( ctx0 ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-09-28 18:04:36 +02:00
if ( batch . token ) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
2023-09-14 18:32:10 +02:00
ggml_allocr_alloc ( lctx . alloc , inp_tokens ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( inp_tokens - > data , batch . token , n_tokens * ggml_element_size ( inp_tokens ) ) ;
2023-09-14 18:32:10 +02:00
}
ggml_set_name ( inp_tokens , " inp_tokens " ) ;
inpL = ggml_get_rows ( ctx0 , model . tok_embeddings , inp_tokens ) ;
} else {
# ifdef GGML_USE_MPI
GGML_ASSERT ( false & & " not implemented " ) ;
# endif
2023-09-28 18:04:36 +02:00
inpL = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_embd , n_tokens ) ;
2023-09-14 18:32:10 +02:00
ggml_allocr_alloc ( lctx . alloc , inpL ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( inpL - > data , batch . embd , n_tokens * n_embd * ggml_element_size ( inpL ) ) ;
2023-09-14 18:32:10 +02:00
}
}
const int i_gpu_start = n_layer - n_gpu_layers ;
( void ) i_gpu_start ;
// offload functions set the tensor output backend to GPU
// tensors are GPU-accelerated if any input or the output has been offloaded
offload_func_t offload_func_nr = llama_nop ; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop ;
offload_func_t offload_func_v = llama_nop ;
# ifdef GGML_USE_CUBLAS
if ( n_gpu_layers > n_layer ) {
offload_func_nr = ggml_cuda_assign_buffers_no_alloc ;
}
if ( n_gpu_layers > n_layer + 1 ) {
offload_func_v = ggml_cuda_assign_buffers_no_alloc ;
}
if ( n_gpu_layers > n_layer + 2 ) {
offload_func_kq = ggml_cuda_assign_buffers_no_alloc ;
}
# endif // GGML_USE_CUBLAS
2023-09-28 18:04:36 +02:00
// KQ_scale
2023-09-14 18:32:10 +02:00
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_F32 , 1 ) ;
2023-09-28 18:04:36 +02:00
ggml_set_name ( KQ_scale , " 1/sqrt(n_embd_head) " ) ;
2023-09-14 18:32:10 +02:00
ggml_allocr_alloc ( lctx . alloc , KQ_scale ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
ggml_set_f32 ( KQ_scale , 1.0f / sqrtf ( float ( n_embd ) / n_head ) ) ;
}
2023-09-28 18:04:36 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_kv , n_tokens , 1 ) ;
offload_func_kq ( KQ_mask ) ;
ggml_set_name ( KQ_mask , " KQ_mask " ) ;
ggml_allocr_alloc ( lctx . alloc , KQ_mask ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
float * data = ( float * ) KQ_mask - > data ;
memset ( data , 0 , ggml_nbytes ( KQ_mask ) ) ;
for ( int h = 0 ; h < 1 ; + + h ) {
for ( int j = 0 ; j < n_tokens ; + + j ) {
const llama_pos pos = batch . pos [ j ] ;
const llama_seq_id seq_id = batch . seq_id [ j ] ;
for ( int i = 0 ; i < n_kv ; + + i ) {
if ( ! kv_self . cells [ i ] . has_seq_id ( seq_id ) | | kv_self . cells [ i ] . pos > pos ) {
data [ h * ( n_kv * n_tokens ) + j * n_kv + i ] = - INFINITY ;
}
}
}
}
}
// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
offload_func_kq ( KQ_pos ) ;
ggml_set_name ( KQ_pos , " KQ_pos " ) ;
ggml_allocr_alloc ( lctx . alloc , KQ_pos ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
int * data = ( int * ) KQ_pos - > data ;
for ( int i = 0 ; i < n_tokens ; + + i ) {
data [ i ] = batch . pos [ i ] ;
}
}
// shift the entire K-cache if needed
if ( do_rope_shift ) {
struct ggml_tensor * K_shift = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_ctx ) ;
offload_func_kq ( K_shift ) ;
ggml_set_name ( K_shift , " K_shift " ) ;
ggml_allocr_alloc ( lctx . alloc , K_shift ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
int * data = ( int * ) K_shift - > data ;
for ( int i = 0 ; i < n_ctx ; + + i ) {
data [ i ] = kv_self . cells [ i ] . delta ;
}
}
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * tmp =
ggml_rope_custom_inplace ( ctx0 ,
ggml_view_3d ( ctx0 , kv_self . k ,
n_embd_head , n_head_kv , n_ctx ,
ggml_element_size ( kv_self . k ) * n_embd_head ,
ggml_element_size ( kv_self . k ) * n_embd_gqa ,
ggml_element_size ( kv_self . k ) * n_embd_gqa * n_ctx * il ) ,
K_shift , n_embd_head , 0 , 0 , freq_base , freq_scale ) ;
offload_func_kq ( tmp ) ;
ggml_build_forward_expand ( gf , tmp ) ;
}
}
2023-09-14 18:32:10 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
ggml_format_name ( inpL , " layer_inp_%d " , il ) ;
offload_func_t offload_func = llama_nop ;
# ifdef GGML_USE_CUBLAS
if ( il > = i_gpu_start ) {
offload_func = ggml_cuda_assign_buffers_no_alloc ;
}
# endif // GGML_USE_CUBLAS
struct ggml_tensor * inpSA = inpL ;
// norm
{
cur = ggml_rms_norm ( ctx0 , inpL , norm_rms_eps ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " rms_norm_0 " ) ;
// cur = cur*attn_norm(broadcasted)
cur = ggml_mul ( ctx0 , cur , model . layers [ il ] . attn_norm ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " attention_norm_0 " ) ;
}
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * tmpk = ggml_mul_mat ( ctx0 , model . layers [ il ] . wk , cur ) ;
offload_func_kq ( tmpk ) ;
ggml_set_name ( tmpk , " tmpk " ) ;
struct ggml_tensor * tmpq = ggml_mul_mat ( ctx0 , model . layers [ il ] . wq , cur ) ;
offload_func_kq ( tmpq ) ;
ggml_set_name ( tmpq , " tmpq " ) ;
struct ggml_tensor * Kcur ;
struct ggml_tensor * Qcur ;
switch ( model . type ) {
case MODEL_7B :
2023-09-28 18:04:36 +02:00
Kcur = ggml_rope_custom ( ctx0 , ggml_reshape_3d ( ctx0 , tmpk , n_embd_head , n_head_kv , n_tokens ) , KQ_pos , n_embd_head , 0 , 0 , freq_base , freq_scale ) ;
Qcur = ggml_rope_custom ( ctx0 , ggml_reshape_3d ( ctx0 , tmpq , n_embd_head , n_head , n_tokens ) , KQ_pos , n_embd_head , 0 , 0 , freq_base , freq_scale ) ;
2023-09-14 18:32:10 +02:00
break ;
case MODEL_13B :
2023-09-28 18:04:36 +02:00
Kcur = ggml_reshape_3d ( ctx0 , tmpk , n_embd / n_head , n_head , n_tokens ) ;
Qcur = ggml_reshape_3d ( ctx0 , tmpq , n_embd / n_head , n_head , n_tokens ) ;
2023-09-14 18:32:10 +02:00
break ;
default :
GGML_ASSERT ( false ) ;
}
offload_func_kq ( Kcur ) ;
ggml_set_name ( Kcur , " Kcur " ) ;
offload_func_kq ( Qcur ) ;
ggml_set_name ( Qcur , " Qcur " ) ;
// store key and value to memory
{
2023-09-28 18:04:36 +02:00
// compute the transposed [n_tokens, n_embd] V matrix
2023-09-14 18:32:10 +02:00
struct ggml_tensor * tmpv = ggml_mul_mat ( ctx0 , model . layers [ il ] . wv , cur ) ;
offload_func_v ( tmpv ) ;
ggml_set_name ( tmpv , " tmpv " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Vcur = ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , tmpv , n_embd_gqa , n_tokens ) ) ;
2023-09-14 18:32:10 +02:00
offload_func_v ( Vcur ) ;
ggml_set_name ( Vcur , " Vcur " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * k = ggml_view_1d ( ctx0 , kv_self . k , n_tokens * n_embd_gqa , ( ggml_element_size ( kv_self . k ) * n_embd_gqa ) * ( il * n_ctx + kv_head ) ) ;
2023-09-14 18:32:10 +02:00
offload_func_kq ( k ) ;
ggml_set_name ( k , " k " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * v = ggml_view_2d ( ctx0 , kv_self . v , n_tokens , n_embd_gqa ,
2023-09-14 18:32:10 +02:00
( n_ctx ) * ggml_element_size ( kv_self . v ) ,
2023-09-28 18:04:36 +02:00
( il * n_ctx ) * ggml_element_size ( kv_self . v ) * n_embd_gqa + kv_head * ggml_element_size ( kv_self . v ) ) ;
2023-09-14 18:32:10 +02:00
offload_func_v ( v ) ;
ggml_set_name ( v , " v " ) ;
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Kcur , k ) ) ;
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Vcur , v ) ) ;
}
struct ggml_tensor * Q = ggml_permute ( ctx0 , Qcur , 0 , 2 , 1 , 3 ) ;
offload_func_kq ( Q ) ;
ggml_set_name ( Q , " Q " ) ;
struct ggml_tensor * K =
ggml_view_3d ( ctx0 , kv_self . k ,
2023-09-28 18:04:36 +02:00
n_embd_head , n_kv , n_head_kv ,
2023-09-14 18:32:10 +02:00
ggml_element_size ( kv_self . k ) * n_embd_gqa ,
ggml_element_size ( kv_self . k ) * n_embd_head ,
ggml_element_size ( kv_self . k ) * n_embd_gqa * n_ctx * il ) ;
offload_func_kq ( K ) ;
ggml_set_name ( K , " K " ) ;
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
offload_func_kq ( KQ ) ;
ggml_set_name ( KQ , " KQ " ) ;
// KQ_scaled = KQ / sqrt(n_embd_head)
2023-09-28 18:04:36 +02:00
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale ( ctx0 , KQ , KQ_scale ) ;
2023-09-14 18:32:10 +02:00
offload_func_kq ( KQ_scaled ) ;
ggml_set_name ( KQ_scaled , " KQ_scaled " ) ;
struct ggml_tensor * KQ_masked ;
struct ggml_tensor * KQ_scaled_alibi ;
switch ( model . type ) {
case MODEL_7B :
2023-09-28 18:04:36 +02:00
KQ_masked = ggml_add ( ctx0 , KQ_scaled , KQ_mask ) ;
2023-09-14 18:32:10 +02:00
break ;
case MODEL_13B :
2023-09-28 18:04:36 +02:00
// TODO: replace with ggml_add()
KQ_scaled_alibi = ggml_alibi ( ctx0 , KQ_scaled , /*n_past*/ 0 , n_head , 8 ) ;
2023-09-14 18:32:10 +02:00
ggml_set_name ( KQ_scaled_alibi , " KQ_scaled_alibi " ) ;
2023-09-28 18:04:36 +02:00
KQ_masked = ggml_add ( ctx0 , KQ_scaled_alibi , KQ_mask ) ;
2023-09-14 18:32:10 +02:00
break ;
default :
GGML_ASSERT ( false ) ;
}
// KQ = soft_max(KQ_masked)
2023-09-28 18:04:36 +02:00
struct ggml_tensor * KQ_soft_max = ggml_soft_max ( ctx0 , KQ_masked ) ;
2023-09-14 18:32:10 +02:00
offload_func_v ( KQ_soft_max ) ;
ggml_set_name ( KQ_soft_max , " KQ_soft_max " ) ;
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d ( ctx0 , kv_self . v ,
2023-09-28 18:04:36 +02:00
n_kv , n_embd_head , n_head_kv ,
2023-09-14 18:32:10 +02:00
ggml_element_size ( kv_self . v ) * n_ctx ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_head ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_gqa * il ) ;
offload_func_v ( V ) ;
ggml_set_name ( V , " V " ) ;
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ_soft_max ) ;
offload_func_v ( KQV ) ;
ggml_set_name ( KQV , " KQV " ) ;
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
offload_func_v ( KQV_merged ) ;
ggml_set_name ( KQV_merged , " KQV_merged " ) ;
2023-09-28 18:04:36 +02:00
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
cur = ggml_cont_2d ( ctx0 , KQV_merged , n_embd , n_tokens ) ;
2023-09-14 18:32:10 +02:00
offload_func_v ( cur ) ;
ggml_set_name ( cur , " KQV_merged_contiguous " ) ;
// projection (no bias)
cur = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . wo ,
cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " result_wo " ) ;
}
struct ggml_tensor * inpFF = ggml_add ( ctx0 , cur , inpSA ) ;
offload_func ( inpFF ) ;
ggml_set_name ( inpFF , " inpFF " ) ;
// feed-forward network
{
// norm
{
cur = ggml_rms_norm ( ctx0 , inpFF , norm_rms_eps ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " rms_norm_1 " ) ;
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul ( ctx0 , cur , model . layers [ il ] . ffn_norm ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " ffn_norm " ) ;
}
struct ggml_tensor * tmp = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . w3 ,
cur ) ;
offload_func ( tmp ) ;
ggml_set_name ( tmp , " result_w3 " ) ;
cur = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . w1 ,
cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " result_w1 " ) ;
// SILU activation
cur = ggml_silu ( ctx0 , cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " silu " ) ;
cur = ggml_mul ( ctx0 , cur , tmp ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " silu_x_result_w3 " ) ;
cur = ggml_mul_mat ( ctx0 ,
model . layers [ il ] . w2 ,
cur ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " result_w2 " ) ;
}
cur = ggml_add ( ctx0 , cur , inpFF ) ;
offload_func ( cur ) ;
ggml_set_name ( cur , " inpFF_+_result_w2 " ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
// norm
{
cur = ggml_rms_norm ( ctx0 , cur , norm_rms_eps ) ;
offload_func_nr ( cur ) ;
ggml_set_name ( cur , " rms_norm_2 " ) ;
// cur = cur*norm(broadcasted)
cur = ggml_mul ( ctx0 , cur , model . output_norm ) ;
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
ggml_set_name ( cur , " result_norm " ) ;
}
// lm_head
cur = ggml_mul_mat ( ctx0 , model . output , cur ) ;
ggml_set_name ( cur , " result_output " ) ;
ggml_build_forward_expand ( gf , cur ) ;
ggml_free ( ctx0 ) ;
return gf ;
}
2023-10-04 15:23:39 +02:00
// Build the ggml compute graph that evaluates `batch` with a Refact model.
//
// Per layer the graph is: RMS norm -> self-attention (separate wq/wk/wv
// projections, no rotary embedding is applied; ggml_alibi() biases the scaled
// scores instead) -> residual add -> RMS norm -> SiLU-gated feed-forward
// (w1, w3, w2) -> residual add. After the last layer a final RMS norm and the
// `model.output` projection produce the tensor named "result_output".
//
// lctx  - supplies the model, hyper/context parameters, the KV cache
//         (kv_self), the scratch buffer the graph is created in (buf_compute)
//         and the allocator (alloc) used for the input tensors.
// batch - either batch.token (token ids) or batch.embd (raw embeddings) is
//         used as input; batch.pos / batch.seq_id drive the attention mask.
//
// While lctx.alloc is in measure mode, input data is not copied and the graph
// is sized for the worst case (n_kv = n_ctx, kv_head = n_ctx - n_tokens).
//
// Returns the graph; the temporary ggml context is freed before returning.
static struct ggml_cgraph * llm_build_refact(
         llama_context & lctx,
     const llama_batch & batch) {
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    const auto & kv_self = lctx.kv_self;

    GGML_ASSERT(!!kv_self.ctx);

    const int64_t n_embd      = hparams.n_embd;
    const int64_t n_layer     = hparams.n_layer;
    const int64_t n_ctx       = cparams.n_ctx;
    const int64_t n_head      = hparams.n_head;
    const int64_t n_head_kv   = hparams.n_head_kv;
    const int64_t n_embd_head = hparams.n_embd_head();
    const int64_t n_embd_gqa  = hparams.n_embd_gqa();

    const float norm_rms_eps = hparams.f_norm_rms_eps;

    const int n_gpu_layers = model.n_gpu_layers;

    const int32_t n_tokens = batch.n_tokens;

    // in measure mode assume the whole cache is attended to and that the new
    // tokens are written at the very end of it
    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;

    auto & buf_compute = lctx.buf_compute;

    // no_alloc: tensors created in ctx0 get no data from the context itself;
    // the inputs below are placed explicitly with ggml_allocr_alloc()
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
        /*.no_alloc   =*/ true,
    };

    struct ggml_context * ctx0 = ggml_init(params);

    ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    if (batch.token) {
        // token ids -> rows of the embedding matrix
        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);

        ggml_allocr_alloc(lctx.alloc, inp_tokens);
        if (!ggml_allocr_is_measure(lctx.alloc)) {
            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
        }
        ggml_set_name(inp_tokens, "inp_tokens");

        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
    } else {
#ifdef GGML_USE_MPI
        GGML_ASSERT(false && "not implemented");
#endif

        // embeddings are provided directly by the caller
        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);

        ggml_allocr_alloc(lctx.alloc, inpL);
        if (!ggml_allocr_is_measure(lctx.alloc)) {
            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
        }
    }

    // first layer index that is offloaded (only read when GGML_USE_CUBLAS is defined)
    const int i_gpu_start = n_layer - n_gpu_layers;
    (void) i_gpu_start;

    // offload functions set the tensor output backend to GPU
    // tensors are GPU-accelerated if any input or the output has been offloaded
    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
    offload_func_t offload_func_kq = llama_nop;
    offload_func_t offload_func_v  = llama_nop;

#ifdef GGML_USE_CUBLAS
    if (n_gpu_layers > n_layer) {
        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
    }
    if (n_gpu_layers > n_layer + 1) {
        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
    }
    if (n_gpu_layers > n_layer + 2) {
        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
    }
#endif // GGML_USE_CUBLAS

    // KQ_scale
    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
    ggml_allocr_alloc(lctx.alloc, KQ_scale);
    if (!ggml_allocr_is_measure(lctx.alloc)) {
        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
    }

    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
    offload_func_kq(KQ_mask);
    ggml_set_name(KQ_mask, "KQ_mask");
    ggml_allocr_alloc(lctx.alloc, KQ_mask);
    if (!ggml_allocr_is_measure(lctx.alloc)) {
        float * data = (float *) KQ_mask->data;
        memset(data, 0, ggml_nbytes(KQ_mask));

        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
                const llama_seq_id seq_id = batch.seq_id[j];

                // token j must not see cache cells of other sequences
                // or cells holding a later position
                for (int i = 0; i < n_kv; ++i) {
                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
                    }
                }
            }
        }
    }

    for (int il = 0; il < n_layer; ++il) {
        ggml_format_name(inpL, "layer_inp_%d", il);

        offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
        if (il >= i_gpu_start) {
            offload_func = ggml_cuda_assign_buffers_no_alloc;
        }
#endif // GGML_USE_CUBLAS

        // kept for the residual connection around the attention block
        struct ggml_tensor * inpSA = inpL;

        // norm
        {
            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
            offload_func(cur);
            ggml_set_name(cur, "rms_norm_0");

            // cur = cur*attn_norm(broadcasted)
            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
            offload_func(cur);
            ggml_set_name(cur, "attention_norm_0");
        }

        // self-attention
        {
            // compute Q and K
            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
            offload_func_kq(tmpk);
            ggml_set_name(tmpk, "tmpk");

            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            offload_func_kq(tmpq);
            ggml_set_name(tmpq, "tmpq");

            struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
            offload_func_kq(Kcur);
            ggml_set_name(Kcur, "Kcur");

            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
            offload_func_kq(Qcur);
            ggml_set_name(Qcur, "Qcur");

            // store key and value to memory
            {
                // compute the transposed [n_tokens, n_embd] V matrix

                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                offload_func_v(tmpv);
                ggml_set_name(tmpv, "tmpv");

                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
                offload_func_v(Vcur);
                ggml_set_name(Vcur, "Vcur");

                // destination in the K cache: layer il, starting at cell kv_head
                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                offload_func_kq(k);
                ggml_set_name(k, "k");

                // destination in the V cache: rows are n_ctx elements apart,
                // the new tokens start at column kv_head of layer il
                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                        (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                offload_func_v(v);
                ggml_set_name(v, "v");

                // the cache writes are added to the graph before anything that reads the cache
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
            }

            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
            offload_func_kq(Q);
            ggml_set_name(Q, "Q");

            // view of the first n_kv cached keys of layer il, split per KV head
            struct ggml_tensor * K =
                ggml_view_3d(ctx0, kv_self.k,
                        n_embd_head, n_kv, n_head_kv,
                        ggml_element_size(kv_self.k)*n_embd_gqa,
                        ggml_element_size(kv_self.k)*n_embd_head,
                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
            offload_func_kq(K);
            ggml_set_name(K, "K");

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            offload_func_kq(KQ);
            ggml_set_name(KQ, "KQ");

            // KQ_scaled = KQ / sqrt(n_embd_head)
            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
            offload_func_kq(KQ_scaled);
            ggml_set_name(KQ_scaled, "KQ_scaled");

            // KQ_masked = mask_past(KQ_scaled)
            // NOTE(review): the trailing 8 is presumably the ALiBi maximum bias - confirm against ggml.h.
            // The alibi node is not passed to any offload function.
            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");

            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
            offload_func_kq(KQ_masked);
            ggml_set_name(KQ_masked, "KQ_masked");

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
            offload_func_v(KQ_soft_max);
            ggml_set_name(KQ_soft_max, "KQ_soft_max");

            // split cached V into n_head heads
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
                        n_kv, n_embd_head, n_head_kv,
                        ggml_element_size(kv_self.v)*n_ctx,
                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
            offload_func_v(V);
            ggml_set_name(V, "V");

            // V is multiplied as a strided view of the cache; no contiguous copy is made
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            offload_func_v(KQV);
            ggml_set_name(KQV, "KQV");

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            offload_func_v(KQV_merged);
            ggml_set_name(KQV_merged, "KQV_merged");

            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
            offload_func_v(cur);
            ggml_set_name(cur, "KQV_merged_contiguous");

            // projection (no bias)
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].wo,
                    cur);
            offload_func(cur);
            ggml_set_name(cur, "result_wo");
        }

        // residual: attention output + layer input
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        offload_func(inpFF);
        ggml_set_name(inpFF, "inpFF");

        // feed-forward network
        {
            // norm
            {
                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
                offload_func(cur);
                ggml_set_name(cur, "rms_norm_1");

                // cur = cur*ffn_norm(broadcasted)
                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
                offload_func(cur);
                ggml_set_name(cur, "ffn_norm");
            }

            // tmp = w3 * cur is the multiplicative branch of the gated FFN
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model.layers[il].w3,
                    cur);
            offload_func(tmp);
            ggml_set_name(tmp, "result_w3");

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w1,
                    cur);
            offload_func(cur);
            ggml_set_name(cur, "result_w1");

            // SILU activation
            cur = ggml_silu(ctx0, cur);
            offload_func(cur);
            ggml_set_name(cur, "silu");

            cur = ggml_mul(ctx0, cur, tmp);
            offload_func(cur);
            ggml_set_name(cur, "silu_x_result_w3");

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w2,
                    cur);
            offload_func(cur);
            ggml_set_name(cur, "result_w2");
        }

        // residual: FFN output + FFN input
        cur = ggml_add(ctx0, cur, inpFF);
        offload_func(cur);
        ggml_set_name(cur, "inpFF_+_result_w2");

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    // norm
    {
        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_2");

        // cur = cur*norm(broadcasted)
        cur = ggml_mul(ctx0, cur, model.output_norm);
        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
        ggml_set_name(cur, "result_norm");
    }

    // lm_head
    cur = ggml_mul_mat(ctx0, model.output, cur);
    ggml_set_name(cur, "result_output");

    ggml_build_forward_expand(gf, cur);

    ggml_free(ctx0);

    return gf;
}
2023-08-23 22:08:04 +02:00
static struct ggml_cgraph * llm_build_falcon (
llama_context & lctx ,
2023-09-28 18:04:36 +02:00
const llama_batch & batch ) {
2023-08-23 22:08:04 +02:00
const auto & model = lctx . model ;
const auto & hparams = model . hparams ;
2023-09-28 21:42:38 +02:00
const auto & cparams = lctx . cparams ;
2023-08-23 22:08:04 +02:00
const auto & kv_self = lctx . kv_self ;
GGML_ASSERT ( ! ! kv_self . ctx ) ;
const int64_t n_embd = hparams . n_embd ;
const int64_t n_layer = hparams . n_layer ;
2023-09-28 21:42:38 +02:00
const int64_t n_ctx = cparams . n_ctx ;
2023-08-23 22:08:04 +02:00
const int64_t n_head = hparams . n_head ;
const int64_t n_head_kv = hparams . n_head_kv ;
const int64_t n_embd_head = hparams . n_embd_head ( ) ;
const int64_t n_embd_gqa = hparams . n_embd_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2023-09-28 21:42:38 +02:00
const float freq_base = cparams . rope_freq_base ;
const float freq_scale = cparams . rope_freq_scale ;
2023-08-23 22:08:04 +02:00
const float norm_eps = hparams . f_norm_eps ;
const int n_gpu_layers = model . n_gpu_layers ;
2023-09-28 18:04:36 +02:00
const int32_t n_tokens = batch . n_tokens ;
const int32_t n_kv = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx : kv_self . n ;
const int32_t kv_head = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx - n_tokens : kv_self . head ;
const bool do_rope_shift = ggml_allocr_is_measure ( lctx . alloc ) | | kv_self . has_shift ;
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
2023-08-23 22:08:04 +02:00
auto & buf_compute = lctx . buf_compute ;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute . size ,
/*.mem_buffer =*/ buf_compute . data ,
/*.no_alloc =*/ false ,
} ;
params . no_alloc = true ;
struct ggml_context * ctx0 = ggml_init ( params ) ;
ggml_cgraph * gf = ggml_new_graph ( ctx0 ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-09-28 18:04:36 +02:00
if ( batch . token ) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
2023-08-23 22:08:04 +02:00
ggml_allocr_alloc ( lctx . alloc , inp_tokens ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( inp_tokens - > data , batch . token , n_tokens * ggml_element_size ( inp_tokens ) ) ;
2023-08-23 22:08:04 +02:00
}
ggml_set_name ( inp_tokens , " inp_tokens " ) ;
inpL = ggml_get_rows ( ctx0 , model . tok_embeddings , inp_tokens ) ;
} else {
# ifdef GGML_USE_MPI
GGML_ASSERT ( false & & " not implemented " ) ;
# endif
2023-09-28 18:04:36 +02:00
inpL = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_embd , n_tokens ) ;
2023-08-23 22:08:04 +02:00
ggml_allocr_alloc ( lctx . alloc , inpL ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( inpL - > data , batch . embd , n_tokens * n_embd * ggml_element_size ( inpL ) ) ;
2023-08-23 22:08:04 +02:00
}
}
const int i_gpu_start = n_layer - n_gpu_layers ;
( void ) i_gpu_start ;
// offload functions set the tensor output backend to GPU
// tensors are GPU-accelerated if any input or the output has been offloaded
offload_func_t offload_func_nr = llama_nop ; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop ;
offload_func_t offload_func_v = llama_nop ;
# ifdef GGML_USE_CUBLAS
if ( n_gpu_layers > n_layer ) {
offload_func_nr = ggml_cuda_assign_buffers_no_alloc ;
}
if ( n_gpu_layers > n_layer + 1 ) {
offload_func_v = ggml_cuda_assign_buffers_no_alloc ;
}
if ( n_gpu_layers > n_layer + 2 ) {
offload_func_kq = ggml_cuda_assign_buffers_no_alloc ;
}
# endif // GGML_USE_CUBLAS
2023-09-28 18:04:36 +02:00
// KQ_scale
2023-08-23 22:08:04 +02:00
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_F32 , 1 ) ;
2023-09-28 18:04:36 +02:00
ggml_set_name ( KQ_scale , " 1/sqrt(n_embd_head) " ) ;
2023-08-23 22:08:04 +02:00
ggml_allocr_alloc ( lctx . alloc , KQ_scale ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
ggml_set_f32 ( KQ_scale , 1.0f / sqrtf ( float ( n_embd ) / n_head ) ) ;
}
2023-09-28 18:04:36 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_kv , n_tokens , 1 ) ;
offload_func_kq ( KQ_mask ) ;
ggml_set_name ( KQ_mask , " KQ_mask " ) ;
ggml_allocr_alloc ( lctx . alloc , KQ_mask ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
float * data = ( float * ) KQ_mask - > data ;
memset ( data , 0 , ggml_nbytes ( KQ_mask ) ) ;
for ( int h = 0 ; h < 1 ; + + h ) {
for ( int j = 0 ; j < n_tokens ; + + j ) {
const llama_pos pos = batch . pos [ j ] ;
const llama_seq_id seq_id = batch . seq_id [ j ] ;
for ( int i = 0 ; i < n_kv ; + + i ) {
if ( ! kv_self . cells [ i ] . has_seq_id ( seq_id ) | | kv_self . cells [ i ] . pos > pos ) {
data [ h * ( n_kv * n_tokens ) + j * n_kv + i ] = - INFINITY ;
}
}
}
}
}
// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
offload_func_kq ( KQ_pos ) ;
ggml_set_name ( KQ_pos , " KQ_pos " ) ;
ggml_allocr_alloc ( lctx . alloc , KQ_pos ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
int * data = ( int * ) KQ_pos - > data ;
for ( int i = 0 ; i < n_tokens ; + + i ) {
data [ i ] = batch . pos [ i ] ;
}
}
// shift the entire K-cache if needed
if ( do_rope_shift ) {
struct ggml_tensor * K_shift = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_ctx ) ;
offload_func_kq ( K_shift ) ;
ggml_set_name ( K_shift , " K_shift " ) ;
ggml_allocr_alloc ( lctx . alloc , K_shift ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
int * data = ( int * ) K_shift - > data ;
for ( int i = 0 ; i < n_ctx ; + + i ) {
data [ i ] = kv_self . cells [ i ] . delta ;
}
}
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * tmp =
ggml_rope_custom_inplace ( ctx0 ,
ggml_view_3d ( ctx0 , kv_self . k ,
n_embd_head , n_head_kv , n_ctx ,
ggml_element_size ( kv_self . k ) * n_embd_head ,
ggml_element_size ( kv_self . k ) * n_embd_gqa ,
ggml_element_size ( kv_self . k ) * n_embd_gqa * n_ctx * il ) ,
K_shift , n_embd_head , 2 , 0 , freq_base , freq_scale ) ;
offload_func_kq ( tmp ) ;
ggml_build_forward_expand ( gf , tmp ) ;
}
}
2023-08-23 22:08:04 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * attn_norm ;
offload_func_t offload_func = llama_nop ;
# ifdef GGML_USE_CUBLAS
if ( il > = i_gpu_start ) {
offload_func = ggml_cuda_assign_buffers_no_alloc ;
}
# endif // GGML_USE_CUBLAS
// self-attention
// TODO: refactor into common function (shared with LLaMA)
{
attn_norm = ggml_norm ( ctx0 , inpL , norm_eps ) ;
offload_func ( attn_norm ) ;
attn_norm = ggml_add ( ctx0 ,
ggml_mul ( ctx0 , attn_norm , model . layers [ il ] . attn_norm ) ,
model . layers [ il ] . attn_norm_b ) ;
offload_func ( attn_norm - > src [ 0 ] ) ;
offload_func ( attn_norm ) ;
if ( model . layers [ il ] . attn_norm_2 ) { // Falcon-40B
cur = ggml_norm ( ctx0 , inpL , norm_eps ) ;
offload_func ( cur ) ;
cur = ggml_add ( ctx0 ,
ggml_mul ( ctx0 , cur , model . layers [ il ] . attn_norm_2 ) ,
model . layers [ il ] . attn_norm_2_b ) ;
offload_func ( cur - > src [ 0 ] ) ;
offload_func ( cur ) ;
} else { // Falcon 7B
cur = attn_norm ;
}
// compute QKV
cur = ggml_mul_mat ( ctx0 , model . layers [ il ] . wqkv , cur ) ;
offload_func_kq ( cur ) ;
// Note that the strides for Kcur, Vcur are set up so that the
// resulting views are misaligned with the tensor's storage
// (by applying the K/V offset we shift the tensor's original
// view to stick out behind the viewed QKV tensor's allocated
// memory, so to say). This is ok because no actual accesses
// happen to that out-of-range memory, but it can require some
// trickery when trying to accurately dump these views for
// debugging.
const size_t wsize = ggml_type_size ( cur - > type ) ;
2023-08-27 15:40:48 +02:00
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
// non-contiguous views is added for the rope operator
struct ggml_tensor * tmpq = ggml_cont ( ctx0 , ggml_view_3d (
2023-09-28 18:04:36 +02:00
ctx0 , cur , n_embd_head , n_head , n_tokens ,
2023-08-23 22:08:04 +02:00
wsize * n_embd_head ,
wsize * n_embd_head * ( n_head + 2 * n_head_kv ) ,
2023-08-27 15:40:48 +02:00
0 ) ) ;
2023-08-23 22:08:04 +02:00
offload_func_kq ( tmpq ) ;
2023-08-27 15:40:48 +02:00
struct ggml_tensor * tmpk = ggml_cont ( ctx0 , ggml_view_3d (
2023-09-28 18:04:36 +02:00
ctx0 , cur , n_embd_head , n_head_kv , n_tokens ,
2023-08-23 22:08:04 +02:00
wsize * n_embd_head ,
wsize * n_embd_head * ( n_head + 2 * n_head_kv ) ,
2023-08-27 15:40:48 +02:00
wsize * n_embd_head * n_head ) ) ;
2023-08-23 22:08:04 +02:00
offload_func_kq ( tmpk ) ;
struct ggml_tensor * tmpv = ggml_view_3d (
2023-09-28 18:04:36 +02:00
ctx0 , cur , n_embd_head , n_head_kv , n_tokens ,
2023-08-23 22:08:04 +02:00
wsize * n_embd_head ,
wsize * n_embd_head * ( n_head + 2 * n_head_kv ) ,
wsize * n_embd_head * ( n_head + n_head_kv ) ) ;
offload_func_v ( tmpv ) ;
// using mode = 2 for neox mode
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Qcur = ggml_rope_custom ( ctx0 , tmpq , KQ_pos , n_embd_head , 2 , 0 , freq_base , freq_scale ) ;
2023-08-23 22:08:04 +02:00
offload_func_kq ( Qcur ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Kcur = ggml_rope_custom ( ctx0 , tmpk , KQ_pos , n_embd_head , 2 , 0 , freq_base , freq_scale ) ;
2023-08-23 22:08:04 +02:00
offload_func_kq ( Kcur ) ;
{
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Vcur = ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , ggml_cont ( ctx0 , tmpv ) , n_embd_gqa , n_tokens ) ) ;
2023-08-23 22:08:04 +02:00
offload_func_v ( Vcur ) ;
offload_func_v ( Vcur - > src [ 0 ] - > src [ 0 ] ) ;
ggml_set_name ( Vcur , " Vcur " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * k = ggml_view_1d ( ctx0 , kv_self . k , n_tokens * n_embd_gqa , ( ggml_element_size ( kv_self . k ) * n_embd_gqa ) * ( il * n_ctx + kv_head ) ) ;
2023-08-23 22:08:04 +02:00
offload_func_kq ( k ) ;
ggml_set_name ( k , " k " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * v = ggml_view_2d ( ctx0 , kv_self . v , n_tokens , n_embd_gqa ,
2023-08-23 22:08:04 +02:00
( n_ctx ) * ggml_element_size ( kv_self . v ) ,
2023-09-28 18:04:36 +02:00
( il * n_ctx ) * ggml_element_size ( kv_self . v ) * n_embd_gqa + kv_head * ggml_element_size ( kv_self . v ) ) ;
2023-08-23 22:08:04 +02:00
offload_func_v ( v ) ;
2023-03-22 06:32:36 +01:00
2023-07-26 15:56:53 +02:00
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Kcur , k ) ) ;
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Vcur , v ) ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-23 22:08:04 +02:00
struct ggml_tensor * Q = ggml_permute ( ctx0 , Qcur , 0 , 2 , 1 , 3 ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( Q ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( Q , " Q " ) ;
2023-03-22 06:32:36 +01:00
struct ggml_tensor * K =
2023-08-17 09:47:09 +02:00
ggml_view_3d ( ctx0 , kv_self . k ,
2023-09-28 18:04:36 +02:00
n_embd_head , n_kv , n_head_kv ,
2023-08-17 09:47:09 +02:00
ggml_element_size ( kv_self . k ) * n_embd_gqa ,
ggml_element_size ( kv_self . k ) * n_embd_head ,
ggml_element_size ( kv_self . k ) * n_embd_gqa * n_ctx * il ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( K ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( K , " K " ) ;
2023-03-22 06:32:36 +01:00
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( KQ ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( KQ , " KQ " ) ;
2023-03-22 06:32:36 +01:00
2023-09-28 18:04:36 +02:00
struct ggml_tensor * KQ_scaled = ggml_scale ( ctx0 , KQ , KQ_scale ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( KQ_scaled ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( KQ_scaled , " KQ_scaled " ) ;
2023-03-22 06:32:36 +01:00
2023-09-28 18:04:36 +02:00
struct ggml_tensor * KQ_masked = ggml_add ( ctx0 , KQ_scaled , KQ_mask ) ;
2023-06-14 19:47:19 +02:00
offload_func_kq ( KQ_masked ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( KQ_masked , " KQ_masked " ) ;
2023-03-22 06:32:36 +01:00
2023-09-28 18:04:36 +02:00
struct ggml_tensor * KQ_soft_max = ggml_soft_max ( ctx0 , KQ_masked ) ;
2023-06-14 19:47:19 +02:00
offload_func_v ( KQ_soft_max ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( KQ_soft_max , " KQ_soft_max " ) ;
2023-03-22 06:32:36 +01:00
2023-04-05 21:07:33 +02:00
struct ggml_tensor * V =
ggml_view_3d ( ctx0 , kv_self . v ,
2023-09-28 18:04:36 +02:00
n_kv , n_embd_head , n_head_kv ,
2023-08-17 09:47:09 +02:00
ggml_element_size ( kv_self . v ) * n_ctx ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_head ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_gqa * il ) ;
2023-06-14 19:47:19 +02:00
offload_func_v ( V ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( V , " V " ) ;
2023-03-22 06:32:36 +01:00
2023-04-05 21:07:33 +02:00
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ_soft_max ) ;
2023-06-14 19:47:19 +02:00
offload_func_v ( KQV ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( KQV , " KQV " ) ;
2023-03-22 06:32:36 +01:00
struct ggml_tensor * KQV_merged = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
2023-06-14 19:47:19 +02:00
offload_func_v ( KQV_merged ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( KQV_merged , " KQV_merged " ) ;
2023-03-22 06:32:36 +01:00
2023-09-28 18:04:36 +02:00
cur = ggml_cont_2d ( ctx0 , KQV_merged , n_embd , n_tokens ) ;
2023-06-14 19:47:19 +02:00
offload_func_v ( cur ) ;
2023-05-02 16:03:00 +02:00
ggml_set_name ( cur , " KQV_merged_contiguous " ) ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
cur = ggml_mul_mat ( ctx0 , model . layers [ il ] . wo , cur ) ;
2023-06-06 21:33:23 +02:00
offload_func ( cur ) ;
ggml_set_name ( cur , " result_wo " ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-23 22:08:04 +02:00
struct ggml_tensor * attn_out = cur ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
// feed forward
2023-03-22 06:32:36 +01:00
{
2023-08-23 22:08:04 +02:00
struct ggml_tensor * inpFF = attn_norm ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
cur = ggml_mul_mat ( ctx0 , model . layers [ il ] . w3 , inpFF ) ;
2023-06-06 21:33:23 +02:00
offload_func ( cur ) ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
cur = ggml_gelu ( ctx0 , cur ) ;
2023-06-06 21:33:23 +02:00
offload_func ( cur ) ;
2023-08-23 22:08:04 +02:00
cur = ggml_mul_mat ( ctx0 , model . layers [ il ] . w2 , cur ) ;
2023-06-06 21:33:23 +02:00
offload_func ( cur ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-23 22:08:04 +02:00
cur = ggml_add ( ctx0 , cur , attn_out ) ;
offload_func ( cur ) ;
cur = ggml_add ( ctx0 , cur , inpL ) ;
2023-06-06 21:33:23 +02:00
offload_func ( cur ) ;
2023-03-22 06:32:36 +01:00
// input for next layer
inpL = cur ;
}
2023-08-23 22:08:04 +02:00
cur = inpL ;
2023-03-22 06:32:36 +01:00
// norm
{
2023-08-23 22:08:04 +02:00
cur = ggml_norm ( ctx0 , cur , norm_eps ) ;
2023-06-14 19:47:19 +02:00
offload_func_nr ( cur ) ;
2023-03-24 22:17:37 +01:00
2023-08-23 22:08:04 +02:00
cur = ggml_add ( ctx0 ,
ggml_mul ( ctx0 , cur , model . output_norm ) ,
model . output_norm_b ) ;
2023-06-06 21:33:23 +02:00
ggml_set_name ( cur , " result_norm " ) ;
2023-03-22 06:32:36 +01:00
}
2023-06-04 22:34:30 +02:00
cur = ggml_mul_mat ( ctx0 , model . output , cur ) ;
2023-06-06 21:33:23 +02:00
ggml_set_name ( cur , " result_output " ) ;
2023-03-24 22:17:37 +01:00
2023-07-26 15:56:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-06-04 22:34:30 +02:00
2023-07-30 15:58:01 +02:00
ggml_free ( ctx0 ) ;
return gf ;
}
2023-09-15 21:02:13 +02:00
static struct ggml_cgraph * llm_build_starcoder (
llama_context & lctx ,
2023-09-28 18:04:36 +02:00
const llama_batch & batch ) {
2023-09-15 21:02:13 +02:00
const auto & model = lctx . model ;
const auto & hparams = model . hparams ;
2023-09-28 21:42:38 +02:00
const auto & cparams = lctx . cparams ;
2023-09-15 21:02:13 +02:00
const auto & kv_self = lctx . kv_self ;
GGML_ASSERT ( ! ! kv_self . ctx ) ;
const int64_t n_embd = hparams . n_embd ;
const int64_t n_layer = hparams . n_layer ;
2023-09-28 21:42:38 +02:00
const int64_t n_ctx = cparams . n_ctx ;
2023-09-15 21:02:13 +02:00
const int64_t n_head = hparams . n_head ;
const int64_t n_head_kv = hparams . n_head_kv ;
const int64_t n_embd_head = hparams . n_embd_head ( ) ;
const int64_t n_embd_gqa = hparams . n_embd_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2023-09-28 18:04:36 +02:00
const float norm_eps = hparams . f_norm_eps ;
const int32_t n_tokens = batch . n_tokens ;
const int32_t n_kv = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx : kv_self . n ;
const int32_t kv_head = ggml_allocr_is_measure ( lctx . alloc ) ? n_ctx - n_tokens : kv_self . head ;
2023-09-15 21:02:13 +02:00
auto & buf_compute = lctx . buf_compute ;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute . size ,
/*.mem_buffer =*/ buf_compute . data ,
/*.no_alloc =*/ false ,
} ;
params . no_alloc = true ;
struct ggml_context * ctx0 = ggml_init ( params ) ;
ggml_cgraph * gf = ggml_new_graph ( ctx0 ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * token ;
struct ggml_tensor * position ;
struct ggml_tensor * inpL ;
2023-09-28 18:04:36 +02:00
if ( batch . token ) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
2023-09-15 21:02:13 +02:00
ggml_allocr_alloc ( lctx . alloc , inp_tokens ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( inp_tokens - > data , batch . token , n_tokens * ggml_element_size ( inp_tokens ) ) ;
2023-09-15 21:02:13 +02:00
}
ggml_set_name ( inp_tokens , " inp_tokens " ) ;
token = ggml_get_rows ( ctx0 , model . tok_embeddings , inp_tokens ) ;
} else {
# ifdef GGML_USE_MPI
GGML_ASSERT ( false & & " not implemented " ) ;
# endif
2023-09-28 18:04:36 +02:00
token = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_embd , n_tokens ) ;
2023-09-15 21:02:13 +02:00
ggml_allocr_alloc ( lctx . alloc , token ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
memcpy ( token - > data , batch . embd , n_tokens * n_embd * ggml_element_size ( token ) ) ;
2023-09-15 21:02:13 +02:00
}
}
{
// Compute position embeddings.
2023-09-28 18:04:36 +02:00
struct ggml_tensor * inp_positions = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens ) ;
2023-09-15 21:02:13 +02:00
ggml_allocr_alloc ( lctx . alloc , inp_positions ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
2023-09-28 18:04:36 +02:00
for ( int i = 0 ; i < n_tokens ; + + i ) {
( ( int32_t * ) inp_positions - > data ) [ i ] = batch . pos [ i ] ;
2023-09-15 21:02:13 +02:00
}
}
ggml_set_name ( inp_positions , " inp_positions " ) ;
position = ggml_get_rows ( ctx0 , model . pos_embeddings , inp_positions ) ;
}
2023-09-28 18:04:36 +02:00
// KQ_scale
2023-09-15 21:02:13 +02:00
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_F32 , 1 ) ;
2023-09-28 18:04:36 +02:00
ggml_set_name ( KQ_scale , " 1/sqrt(n_embd_head) " ) ;
2023-09-15 21:02:13 +02:00
ggml_allocr_alloc ( lctx . alloc , KQ_scale ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
ggml_set_f32 ( KQ_scale , 1.0f / sqrtf ( float ( n_embd ) / n_head ) ) ;
}
2023-09-28 18:04:36 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_kv , n_tokens , 1 ) ;
ggml_set_name ( KQ_mask , " KQ_mask " ) ;
ggml_allocr_alloc ( lctx . alloc , KQ_mask ) ;
if ( ! ggml_allocr_is_measure ( lctx . alloc ) ) {
float * data = ( float * ) KQ_mask - > data ;
memset ( data , 0 , ggml_nbytes ( KQ_mask ) ) ;
for ( int h = 0 ; h < 1 ; + + h ) {
for ( int j = 0 ; j < n_tokens ; + + j ) {
const llama_pos pos = batch . pos [ j ] ;
const llama_seq_id seq_id = batch . seq_id [ j ] ;
for ( int i = 0 ; i < n_kv ; + + i ) {
if ( ! kv_self . cells [ i ] . has_seq_id ( seq_id ) | | kv_self . cells [ i ] . pos > pos ) {
data [ h * ( n_kv * n_tokens ) + j * n_kv + i ] = - INFINITY ;
}
}
}
}
}
2023-09-15 21:02:13 +02:00
inpL = ggml_add ( ctx0 , token , position ) ;
ggml_set_name ( inpL , " inpL " ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
{
// Norm
cur = ggml_norm ( ctx0 , inpL , norm_eps ) ;
cur = ggml_add ( ctx0 , ggml_mul ( ctx0 , cur , model . layers [ il ] . attn_norm ) , model . layers [ il ] . attn_norm_b ) ;
}
{
// Self Attention
cur = ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . wqkv , cur ) , model . layers [ il ] . bqkv ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * tmpq = ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * n_embd ) ;
struct ggml_tensor * tmpk = ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , sizeof ( float ) * n_embd ) ;
struct ggml_tensor * tmpv = ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ;
2023-09-15 21:02:13 +02:00
struct ggml_tensor * Qcur = tmpq ;
struct ggml_tensor * Kcur = tmpk ;
{
2023-09-28 18:04:36 +02:00
struct ggml_tensor * Vcur = ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , ggml_cont ( ctx0 , tmpv ) , n_embd_gqa , n_tokens ) ) ;
2023-09-15 21:02:13 +02:00
ggml_set_name ( Vcur , " Vcur " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * k = ggml_view_1d ( ctx0 , kv_self . k , n_tokens * n_embd_gqa , ( ggml_element_size ( kv_self . k ) * n_embd_gqa ) * ( il * n_ctx + kv_head ) ) ;
2023-09-15 21:02:13 +02:00
ggml_set_name ( k , " k " ) ;
2023-09-28 18:04:36 +02:00
struct ggml_tensor * v = ggml_view_2d ( ctx0 , kv_self . v , n_tokens , n_embd_gqa ,
2023-09-15 21:02:13 +02:00
( n_ctx ) * ggml_element_size ( kv_self . v ) ,
2023-09-28 18:04:36 +02:00
( il * n_ctx ) * ggml_element_size ( kv_self . v ) * n_embd_gqa + kv_head * ggml_element_size ( kv_self . v ) ) ;
2023-09-15 21:02:13 +02:00
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Kcur , k ) ) ;
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , Vcur , v ) ) ;
}
struct ggml_tensor * Q =
ggml_permute ( ctx0 ,
ggml_cpy ( ctx0 ,
Qcur ,
2023-09-28 18:04:36 +02:00
ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_embd_head , n_head , n_tokens ) ) ,
2023-09-15 21:02:13 +02:00
0 , 2 , 1 , 3 ) ;
ggml_set_name ( Q , " Q " ) ;
struct ggml_tensor * K =
ggml_view_3d ( ctx0 , kv_self . k ,
2023-09-28 18:04:36 +02:00
n_embd_head , n_kv , n_head_kv ,
2023-09-15 21:02:13 +02:00
ggml_element_size ( kv_self . k ) * n_embd_gqa ,
ggml_element_size ( kv_self . k ) * n_embd_head ,
ggml_element_size ( kv_self . k ) * n_embd_gqa * n_ctx * il ) ;
ggml_set_name ( K , " K " ) ;
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
ggml_set_name ( KQ , " KQ " ) ;
// KQ_scaled = KQ / sqrt(n_embd_head)
2023-09-28 18:04:36 +02:00
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
2023-09-15 21:02:13 +02:00
struct ggml_tensor * KQ_scaled = ggml_scale_inplace ( ctx0 , KQ , KQ_scale ) ;
ggml_set_name ( KQ_scaled , " KQ_scaled " ) ;
// KQ_masked = mask_past(KQ_scaled)
2023-09-28 18:04:36 +02:00
struct ggml_tensor * KQ_masked = ggml_add ( ctx0 , KQ_scaled , KQ_mask ) ;
2023-09-15 21:02:13 +02:00
ggml_set_name ( KQ_masked , " KQ_masked " ) ;
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace ( ctx0 , KQ_masked ) ;
ggml_set_name ( KQ_soft_max , " KQ_soft_max " ) ;
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d ( ctx0 , kv_self . v ,
2023-09-28 18:04:36 +02:00
n_kv , n_embd_head , n_head_kv ,
2023-09-15 21:02:13 +02:00
ggml_element_size ( kv_self . v ) * n_ctx ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_head ,
ggml_element_size ( kv_self . v ) * n_ctx * n_embd_gqa * il ) ;
ggml_set_name ( V , " V " ) ;
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ_soft_max ) ;
ggml_set_name ( KQV , " KQV " ) ;
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
ggml_set_name ( KQV_merged , " KQV_merged " ) ;
2023-09-28 18:04:36 +02:00
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
cur = ggml_cont_2d ( ctx0 , KQV_merged , n_embd , n_tokens ) ;
2023-09-15 21:02:13 +02:00
ggml_set_name ( cur , " KQV_merged_contiguous " ) ;
}
// Projection
cur = ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . wo , cur ) , model . layers [ il ] . bo ) ;
// Add the input
cur = ggml_add ( ctx0 , cur , inpL ) ;
struct ggml_tensor * inpFF = cur ;
// FF
{
// Norm
{
cur = ggml_norm ( ctx0 , inpFF , norm_eps ) ;
cur = ggml_add ( ctx0 , ggml_mul ( ctx0 , cur , model . layers [ il ] . ffn_norm ) , model . layers [ il ] . ffn_norm_b ) ;
}
cur = ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . w3 , cur ) , model . layers [ il ] . b3 ) ;
// GELU activation
cur = ggml_gelu ( ctx0 , cur ) ;
// Projection
cur = ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . layers [ il ] . w2 , cur ) , model . layers [ il ] . b2 ) ;
}
inpL = ggml_add ( ctx0 , cur , inpFF ) ;
}
// Output Norm
{
cur = ggml_norm ( ctx0 , inpL , norm_eps ) ;
cur = ggml_add ( ctx0 , ggml_mul ( ctx0 , cur , model . output_norm ) , model . output_norm_b ) ;
}
ggml_set_name ( cur , " result_norm " ) ;
cur = ggml_mul_mat ( ctx0 , model . output , cur ) ;
ggml_set_name ( cur , " result_output " ) ;
ggml_build_forward_expand ( gf , cur ) ;
ggml_free ( ctx0 ) ;
return gf ;
}
2023-08-23 22:08:04 +02:00
static struct ggml_cgraph * llama_build_graph (
llama_context & lctx ,
2023-09-28 18:04:36 +02:00
const llama_batch & batch ) {
2023-08-23 22:08:04 +02:00
const auto & model = lctx . model ;
struct ggml_cgraph * result = NULL ;
switch ( model . arch ) {
case LLM_ARCH_LLAMA :
{
2023-09-28 18:04:36 +02:00
result = llm_build_llama ( lctx , batch ) ;
2023-08-23 22:08:04 +02:00
} break ;
2023-09-14 18:32:10 +02:00
case LLM_ARCH_BAICHUAN :
{
2023-09-28 18:04:36 +02:00
result = llm_build_baichaun ( lctx , batch ) ;
2023-09-14 18:32:10 +02:00
} break ;
2023-08-23 22:08:04 +02:00
case LLM_ARCH_FALCON :
{
2023-09-28 18:04:36 +02:00
result = llm_build_falcon ( lctx , batch ) ;
2023-08-23 22:08:04 +02:00
} break ;
2023-09-15 21:02:13 +02:00
case LLM_ARCH_STARCODER :
{
2023-09-28 18:04:36 +02:00
result = llm_build_starcoder ( lctx , batch ) ;
2023-09-15 21:02:13 +02:00
} break ;
2023-10-04 15:23:39 +02:00
case LLM_ARCH_REFACT :
{
result = llm_build_refact ( lctx , batch ) ;
} break ;
2023-08-23 22:08:04 +02:00
default :
GGML_ASSERT ( false ) ;
2023-09-28 23:41:44 +02:00
}
2023-08-23 22:08:04 +02:00
return result ;
}
2023-09-28 18:04:36 +02:00
// decode a batch of tokens by evaluating the transformer
2023-07-30 15:58:01 +02:00
//
// - lctx: llama context
2023-09-28 18:04:36 +02:00
// - batch: batch to evaluate
2023-07-30 15:58:01 +02:00
// - n_threads: number of threads to use
//
2023-09-28 18:04:36 +02:00
// return 0 on success
// return positive int on warning
// return negative int on error
//
static int llama_decode_internal (
2023-07-30 15:58:01 +02:00
llama_context & lctx ,
2023-09-28 21:42:38 +02:00
llama_batch batch ) {
2023-09-28 18:04:36 +02:00
const uint32_t n_tokens = batch . n_tokens ;
2023-07-30 15:58:01 +02:00
2023-09-28 18:04:36 +02:00
if ( n_tokens = = 0 ) {
LLAMA_LOG_ERROR ( " %s: n_tokens == 0 " , __func__ ) ;
return - 1 ;
}
2023-07-30 15:58:01 +02:00
2023-09-28 21:42:38 +02:00
const auto & model = lctx . model ;
const auto & hparams = model . hparams ;
const auto & cparams = lctx . cparams ;
const auto n_batch = cparams . n_batch ;
GGML_ASSERT ( n_tokens < = n_batch ) ;
int n_threads = n_tokens = = 1 ? cparams . n_threads : cparams . n_threads_batch ;
2023-09-28 18:04:36 +02:00
GGML_ASSERT ( ( ! batch . token & & batch . embd ) | | ( batch . token & & ! batch . embd ) ) ; // NOLINT
2023-08-18 12:44:58 +02:00
2023-07-30 15:58:01 +02:00
const int64_t t_start_us = ggml_time_us ( ) ;
# ifdef GGML_USE_MPI
2023-09-28 18:04:36 +02:00
// TODO: needs fix after #3228
GGML_ASSERT ( false & & " not implemented " ) ;
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
2023-07-30 15:58:01 +02:00
# endif
2023-08-27 17:55:41 +02:00
GGML_ASSERT ( n_threads > 0 ) ;
2023-09-28 18:04:36 +02:00
auto & kv_self = lctx . kv_self ;
2023-07-30 15:58:01 +02:00
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ! ! kv_self . ctx ) ;
2023-07-30 15:58:01 +02:00
2023-08-23 22:08:04 +02:00
const int64_t n_embd = hparams . n_embd ;
const int64_t n_vocab = hparams . n_vocab ;
2023-07-30 15:58:01 +02:00
2023-09-28 18:04:36 +02:00
// helpers for smoother batch API transistion
// after deprecating the llama_eval calls, these will be removed
std : : vector < llama_pos > pos ;
std : : vector < llama_seq_id > seq_id ;
if ( batch . pos = = nullptr ) {
pos . resize ( n_tokens ) ;
for ( uint32_t i = 0 ; i < n_tokens ; i + + ) {
pos [ i ] = batch . all_pos_0 + i * batch . all_pos_1 ;
}
batch . pos = pos . data ( ) ;
}
if ( batch . seq_id = = nullptr ) {
seq_id . resize ( n_tokens ) ;
for ( uint32_t i = 0 ; i < n_tokens ; i + + ) {
seq_id [ i ] = batch . all_seq_id ;
}
batch . seq_id = seq_id . data ( ) ;
}
// we always start to search for a free slot from the start of the cache
// TODO: better strategies can be implemented
kv_self . head = 0 ;
if ( ! llama_kv_cache_find_slot ( kv_self , batch ) ) {
return 1 ;
}
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
//kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
2023-09-28 21:42:38 +02:00
kv_self . n = std : : min ( ( int32_t ) cparams . n_ctx , std : : max ( 32 , llama_kv_cache_cell_max ( kv_self ) ) ) ;
2023-09-28 18:04:36 +02:00
//printf("kv_self.n = %d\n", kv_self.n);
2023-07-30 15:58:01 +02:00
ggml_allocr_reset ( lctx . alloc ) ;
2023-09-28 18:04:36 +02:00
ggml_cgraph * gf = llama_build_graph ( lctx , batch ) ;
2023-07-30 15:58:01 +02:00
ggml_allocr_alloc_graph ( lctx . alloc , gf ) ;
2023-08-22 15:25:19 +02:00
# ifdef GGML_USE_CUBLAS
for ( int i = 0 ; i < gf - > n_leafs ; i + + ) {
ggml_tensor * node = gf - > leafs [ i ] ;
if ( node - > backend = = GGML_BACKEND_GPU & & node - > extra = = NULL ) {
ggml_cuda_assign_scratch_offset ( node , ( char * ) node - > data - ( char * ) lctx . buf_alloc . data ) ;
2023-09-28 18:04:36 +02:00
ggml_cuda_copy_to_device ( node ) ;
2023-08-22 15:25:19 +02:00
}
}
for ( int i = 0 ; i < gf - > n_nodes ; i + + ) {
ggml_tensor * node = gf - > nodes [ i ] ;
if ( node - > backend = = GGML_BACKEND_GPU & & node - > extra = = NULL ) {
ggml_cuda_assign_scratch_offset ( node , ( char * ) node - > data - ( char * ) lctx . buf_alloc . data ) ;
}
}
2023-09-28 21:42:38 +02:00
ggml_cuda_set_mul_mat_q ( cparams . mul_mat_q ) ;
2023-07-30 15:58:01 +02:00
# endif
2023-08-09 22:46:40 +02:00
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
2023-07-30 15:58:01 +02:00
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
2023-09-05 09:46:39 +02:00
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
// with the BLAS calls. need a better solution
2023-09-28 18:04:36 +02:00
if ( n_tokens > = 32 & & ggml_cpu_has_blas ( ) & & ! ggml_cpu_has_gpublas ( ) ) {
2023-09-05 09:46:39 +02:00
n_threads = std : : min ( 4 , n_threads ) ;
}
2023-07-25 14:32:20 +02:00
2023-09-21 10:43:53 +02:00
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
const bool full_offload_supported = model . arch = = LLM_ARCH_LLAMA | |
model . arch = = LLM_ARCH_BAICHUAN | |
2023-10-04 15:23:39 +02:00
model . arch = = LLM_ARCH_FALCON | |
model . arch = = LLM_ARCH_REFACT ;
2023-09-21 10:43:53 +02:00
const bool fully_offloaded = model . n_gpu_layers > = ( int ) hparams . n_layer + 3 ;
if ( ggml_cpu_has_cublas ( ) & & full_offload_supported & & fully_offloaded ) {
n_threads = 1 ;
}
2023-08-23 22:08:04 +02:00
struct ggml_tensor * res = gf - > nodes [ gf - > n_nodes - 1 ] ;
2023-07-31 11:02:53 +02:00
struct ggml_tensor * embeddings = gf - > nodes [ gf - > n_nodes - 2 ] ;
2023-08-23 22:08:04 +02:00
GGML_ASSERT ( strcmp ( res - > name , " result_output " ) = = 0 ) ;
GGML_ASSERT ( strcmp ( embeddings - > name , " result_norm " ) = = 0 ) ;
2023-07-31 11:02:53 +02:00
2023-07-10 17:49:56 +02:00
# if GGML_USE_MPI
2023-07-30 15:58:01 +02:00
const int64_t n_layer = hparams . n_layer ;
2023-07-26 15:56:53 +02:00
ggml_mpi_graph_compute_pre ( lctx . ctx_mpi , gf , n_layer ) ;
2023-07-10 17:49:56 +02:00
# endif
2023-06-04 22:34:30 +02:00
# ifdef GGML_USE_METAL
2023-08-16 22:07:04 +02:00
if ( lctx . ctx_metal ) {
2023-07-07 18:24:01 +02:00
ggml_metal_set_n_cb ( lctx . ctx_metal , n_threads ) ;
2023-07-26 15:56:53 +02:00
ggml_metal_graph_compute ( lctx . ctx_metal , gf ) ;
2023-06-04 22:34:30 +02:00
} else {
2023-07-26 15:56:53 +02:00
ggml_graph_compute_helper ( lctx . work_buffer , gf , n_threads ) ;
2023-06-04 22:34:30 +02:00
}
# else
2023-07-26 15:56:53 +02:00
ggml_graph_compute_helper ( lctx . work_buffer , gf , n_threads ) ;
2023-06-04 22:34:30 +02:00
# endif
2023-07-10 17:49:56 +02:00
# if GGML_USE_MPI
2023-07-26 15:56:53 +02:00
ggml_mpi_graph_compute_post ( lctx . ctx_mpi , gf , n_layer ) ;
2023-07-10 17:49:56 +02:00
# endif
2023-09-28 18:04:36 +02:00
// update the kv ring buffer
lctx . kv_self . head + = n_tokens ;
lctx . kv_self . has_shift = false ;
2023-03-22 06:32:36 +01:00
2023-04-23 17:15:39 +02:00
# ifdef GGML_PERF
2023-04-05 21:07:33 +02:00
// print timing information per ggml operation (for debugging purposes)
// requires GGML_PERF to be defined
2023-07-26 15:56:53 +02:00
ggml_graph_print ( gf ) ;
2023-04-23 17:15:39 +02:00
# endif
2023-04-05 21:07:33 +02:00
// plot the computation graph in dot format (for debugging purposes)
2023-03-22 06:32:36 +01:00
//if (n_past%100 == 0) {
2023-07-26 15:56:53 +02:00
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
2023-03-22 06:32:36 +01:00
//}
2023-03-24 16:05:13 +01:00
// extract logits
{
auto & logits_out = lctx . logits ;
2023-09-28 18:04:36 +02:00
if ( batch . logits ) {
logits_out . resize ( n_vocab * n_tokens ) ;
for ( uint32_t i = 0 ; i < n_tokens ; i + + ) {
if ( batch . logits [ i ] = = 0 ) {
continue ;
}
memcpy ( logits_out . data ( ) + ( n_vocab * i ) , ( float * ) ggml_get_data ( res ) + ( n_vocab * i ) , sizeof ( float ) * n_vocab ) ;
}
} else if ( lctx . logits_all ) {
logits_out . resize ( n_vocab * n_tokens ) ;
memcpy ( logits_out . data ( ) , ( float * ) ggml_get_data ( res ) , sizeof ( float ) * n_vocab * n_tokens ) ;
2023-03-24 16:05:13 +01:00
} else {
logits_out . resize ( n_vocab ) ;
2023-09-28 18:04:36 +02:00
memcpy ( logits_out . data ( ) , ( float * ) ggml_get_data ( res ) + ( n_vocab * ( n_tokens - 1 ) ) , sizeof ( float ) * n_vocab ) ;
2023-03-24 16:05:13 +01:00
}
}
// extract embeddings
2023-05-13 10:23:15 +02:00
if ( ! lctx . embedding . empty ( ) ) {
2023-03-24 16:05:13 +01:00
auto & embedding_out = lctx . embedding ;
2023-03-22 06:32:36 +01:00
2023-03-24 16:05:13 +01:00
embedding_out . resize ( n_embd ) ;
2023-09-28 18:04:36 +02:00
memcpy ( embedding_out . data ( ) , ( float * ) ggml_get_data ( embeddings ) + ( n_embd * ( n_tokens - 1 ) ) , sizeof ( float ) * n_embd ) ;
2023-03-22 06:32:36 +01:00
}
// measure the performance only for the single-token evals
2023-09-28 18:04:36 +02:00
if ( n_tokens = = 1 ) {
2023-03-22 06:32:36 +01:00
lctx . t_eval_us + = ggml_time_us ( ) - t_start_us ;
lctx . n_eval + + ;
}
2023-09-28 18:04:36 +02:00
else if ( n_tokens > 1 ) {
2023-03-25 15:34:23 +01:00
lctx . t_p_eval_us + = ggml_time_us ( ) - t_start_us ;
2023-09-28 18:04:36 +02:00
lctx . n_p_eval + = n_tokens ;
2023-03-25 15:34:23 +01:00
}
2023-03-22 06:32:36 +01:00
2023-09-28 18:04:36 +02:00
// get a more accurate load time, upon first eval
// TODO: fix this
if ( ! lctx . has_evaluated_once ) {
lctx . t_load_us = ggml_time_us ( ) - lctx . t_start_us ;
lctx . has_evaluated_once = true ;
}
return 0 ;
2023-03-22 06:32:36 +01:00
}
//
// tokenizer
//
2023-08-21 22:07:43 +02:00
// Returns the `type` field of `vocab`.
static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
    const auto kind = vocab.type;
    return kind;
}
// True when the vocab entry for `id` is typed LLAMA_TOKEN_TYPE_NORMAL.
// `id` indexes vocab.id_to_token directly (operator[], no bounds check).
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
    const auto & entry = vocab.id_to_token[id];
    return LLAMA_TOKEN_TYPE_NORMAL == entry.type;
}
// True when the vocab entry for `id` is typed LLAMA_TOKEN_TYPE_UNKNOWN.
// `id` indexes vocab.id_to_token directly (operator[], no bounds check).
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
    const auto & entry = vocab.id_to_token[id];
    return LLAMA_TOKEN_TYPE_UNKNOWN == entry.type;
}
// True when the vocab entry for `id` is typed LLAMA_TOKEN_TYPE_CONTROL.
// `id` indexes vocab.id_to_token directly (operator[], no bounds check).
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
    const auto & entry = vocab.id_to_token[id];
    return LLAMA_TOKEN_TYPE_CONTROL == entry.type;
}
// True when the vocab entry for `id` is typed LLAMA_TOKEN_TYPE_BYTE.
// `id` indexes vocab.id_to_token directly (operator[], no bounds check).
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
    const auto & entry = vocab.id_to_token[id];
    return LLAMA_TOKEN_TYPE_BYTE == entry.type;
}
2023-10-03 09:16:26 +02:00
// True when the vocab entry for `id` is typed LLAMA_TOKEN_TYPE_USER_DEFINED.
// `id` indexes vocab.id_to_token directly (operator[], no bounds check).
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
    const auto & entry = vocab.id_to_token[id];
    return LLAMA_TOKEN_TYPE_USER_DEFINED == entry.type;
}
static uint8_t llama_token_to_byte ( const llama_vocab & vocab , llama_token id ) {
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( llama_is_byte_token ( vocab , id ) ) ;
const auto & token_data = vocab . id_to_token . at ( id ) ;
2023-10-03 09:16:26 +02:00
switch ( llama_vocab_get_type ( vocab ) ) {
case LLAMA_VOCAB_TYPE_SPM : {
auto buf = token_data . text . substr ( 3 , 2 ) ;
return strtol ( buf . c_str ( ) , NULL , 16 ) ;
}
case LLAMA_VOCAB_TYPE_BPE : {
GGML_ASSERT ( false ) ;
return unicode_to_bytes_bpe ( token_data . text ) ;
}
default :
GGML_ASSERT ( false ) ;
}
2023-08-21 22:07:43 +02:00
}
// Maps a raw byte to the id of the vocab token that represents it.
//
// - vocab: vocabulary to search
// - ch:    byte value to look up
//
// SPM: looks up the text "<0xNN>" (NN = upper-case hex of ch).
// BPE: looks up the text produced by bytes_to_unicode_bpe(ch).
// Both lookups use token_to_id.at(), which throws std::out_of_range when the
// token is missing. Any other vocab type trips GGML_ASSERT(false).
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
            // "<0xNN>" is 6 characters + NUL, i.e. exactly sizeof(buf).
            // The format string must not carry any padding: extra characters would
            // be truncated by snprintf and make the length assertion below fail.
            char buf[7];
            int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
            GGML_ASSERT(0 <= result && result < 7);
            return vocab.token_to_id.at(buf);
        }
        case LLAMA_VOCAB_TYPE_BPE: {
            return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
        }
        default:
            GGML_ASSERT(false);
    }
}
2023-08-27 13:19:19 +02:00
static void llama_escape_whitespace ( std : : string & text ) {
replace_all ( text , " " , " \xe2 \x96 \x81 " ) ;
2023-08-21 22:07:43 +02:00
}
2023-08-24 11:26:01 +02:00
static void llama_unescape_whitespace ( std : : string & word ) {
replace_all ( word , " \xe2 \x96 \x81 " , " " ) ;
2023-08-21 22:07:43 +02:00
}
2023-08-23 22:08:04 +02:00
// One node of a tokenizer's symbol chain: the byte range [text, text + n) of the
// input plus the indices of its neighbours in the symbol vector.
// The tokenizers in this file use -1 for "no neighbour" and n == 0 for a symbol
// that has been merged into its left neighbour.
struct llm_symbol {
    typedef int index;

    index        prev; // index of the previous symbol
    index        next; // index of the next symbol
    const char * text; // first byte of the symbol (not owned, not NUL-terminated)
    size_t       n;    // length in bytes
};

static_assert(std::is_trivially_copyable<llm_symbol>::value, " llm_symbol is not trivially copyable ");
// SPM tokenizer
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
2023-05-13 10:23:15 +02:00
2023-08-23 22:08:04 +02:00
struct llm_bigram_spm {
2023-03-22 06:32:36 +01:00
struct comparator {
2023-08-23 22:08:04 +02:00
bool operator ( ) ( llm_bigram_spm & l , llm_bigram_spm & r ) {
2023-03-22 06:32:36 +01:00
return ( l . score < r . score ) | | ( l . score = = r . score & & l . left > r . left ) ;
}
} ;
2023-08-23 22:08:04 +02:00
using queue_storage = std : : vector < llm_bigram_spm > ;
using queue = std : : priority_queue < llm_bigram_spm , queue_storage , comparator > ;
llm_symbol : : index left ;
llm_symbol : : index right ;
2023-03-22 06:32:36 +01:00
float score ;
size_t size ;
} ;
2023-08-23 22:08:04 +02:00
struct llm_tokenizer_spm {
llm_tokenizer_spm ( const llama_vocab & vocab ) : vocab ( vocab ) { }
2023-03-22 06:32:36 +01:00
void tokenize ( const std : : string & text , std : : vector < llama_vocab : : id > & output ) {
// split string into utf8 chars
int index = 0 ;
size_t offs = 0 ;
while ( offs < text . size ( ) ) {
2023-08-23 22:08:04 +02:00
llm_symbol sym ;
2023-08-21 22:07:43 +02:00
size_t len = utf8_len ( text [ offs ] ) ;
2023-03-22 06:32:36 +01:00
sym . text = text . c_str ( ) + offs ;
2023-09-13 15:19:44 +02:00
sym . n = std : : min ( len , text . size ( ) - offs ) ;
offs + = sym . n ;
2023-03-22 06:32:36 +01:00
sym . prev = index - 1 ;
sym . next = offs = = text . size ( ) ? - 1 : index + 1 ;
index + + ;
2023-08-23 22:08:04 +02:00
symbols . emplace_back ( sym ) ;
2023-03-22 06:32:36 +01:00
}
// seed the work queue with all possible 2-character tokens.
2023-08-23 22:08:04 +02:00
for ( size_t i = 1 ; i < symbols . size ( ) ; + + i ) {
2023-03-22 06:32:36 +01:00
try_add_bigram ( i - 1 , i ) ;
}
// keep substituting the highest frequency pairs for as long as we can.
2023-08-23 22:08:04 +02:00
while ( ! work_queue . empty ( ) ) {
auto bigram = work_queue . top ( ) ;
work_queue . pop ( ) ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
auto & left_sym = symbols [ bigram . left ] ;
auto & right_sym = symbols [ bigram . right ] ;
2023-03-22 06:32:36 +01:00
// if one of the symbols already got merged, skip it.
if ( left_sym . n = = 0 | | right_sym . n = = 0 | |
left_sym . n + right_sym . n ! = bigram . size ) {
continue ;
}
// merge the right sym into the left one
left_sym . n + = right_sym . n ;
right_sym . n = 0 ;
2023-08-09 22:46:40 +02:00
//LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
2023-03-22 06:32:36 +01:00
// remove the right sym from the chain
left_sym . next = right_sym . next ;
if ( right_sym . next > = 0 ) {
2023-08-23 22:08:04 +02:00
symbols [ right_sym . next ] . prev = bigram . left ;
2023-03-22 06:32:36 +01:00
}
// find more substitutions
try_add_bigram ( left_sym . prev , bigram . left ) ;
try_add_bigram ( bigram . left , left_sym . next ) ;
}
2023-08-23 22:08:04 +02:00
for ( int i = 0 ; i ! = - 1 ; i = symbols [ i ] . next ) {
auto & symbol = symbols [ i ] ;
2023-08-21 22:07:43 +02:00
resegment ( symbol , output ) ;
2023-03-22 06:32:36 +01:00
}
}
private :
2023-08-23 22:08:04 +02:00
void resegment ( llm_symbol & symbol , std : : vector < llama_vocab : : id > & output ) {
2023-08-21 22:07:43 +02:00
auto text = std : : string ( symbol . text , symbol . n ) ;
2023-08-23 22:08:04 +02:00
auto token = vocab . token_to_id . find ( text ) ;
2023-08-21 22:07:43 +02:00
// Do we need to support is_unused?
2023-08-23 22:08:04 +02:00
if ( token ! = vocab . token_to_id . end ( ) ) {
2023-08-21 22:07:43 +02:00
output . push_back ( ( * token ) . second ) ;
return ;
}
const auto p = rev_merge . find ( text ) ;
if ( p = = rev_merge . end ( ) ) {
// output any symbols that did not form tokens as bytes.
for ( int j = 0 ; j < ( int ) symbol . n ; + + j ) {
2023-08-23 22:08:04 +02:00
llama_vocab : : id token_id = llama_byte_to_token ( vocab , symbol . text [ j ] ) ;
2023-08-21 22:07:43 +02:00
output . push_back ( token_id ) ;
}
return ;
}
2023-08-23 22:08:04 +02:00
resegment ( symbols [ p - > second . first ] , output ) ;
resegment ( symbols [ p - > second . second ] , output ) ;
2023-08-21 22:07:43 +02:00
}
2023-03-22 06:32:36 +01:00
void try_add_bigram ( int left , int right ) {
if ( left = = - 1 | | right = = - 1 ) {
return ;
}
2023-08-23 22:08:04 +02:00
const std : : string text = std : : string ( symbols [ left ] . text , symbols [ left ] . n + symbols [ right ] . n ) ;
auto token = vocab . token_to_id . find ( text ) ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
if ( token = = vocab . token_to_id . end ( ) ) {
2023-03-22 06:32:36 +01:00
return ;
}
2023-08-23 22:08:04 +02:00
if ( static_cast < size_t > ( ( * token ) . second ) > = vocab . id_to_token . size ( ) ) {
2023-03-22 06:32:36 +01:00
return ;
}
2023-08-23 22:08:04 +02:00
const auto & tok_data = vocab . id_to_token [ ( * token ) . second ] ;
2023-03-22 06:32:36 +01:00
2023-08-23 22:08:04 +02:00
llm_bigram_spm bigram ;
bigram . left = left ;
2023-03-22 06:32:36 +01:00
bigram . right = right ;
2023-08-21 22:07:43 +02:00
bigram . score = tok_data . score ;
2023-08-23 22:08:04 +02:00
bigram . size = text . size ( ) ;
work_queue . push ( bigram ) ;
2023-08-21 22:07:43 +02:00
// Do we need to support is_unused?
rev_merge [ text ] = std : : make_pair ( left , right ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-23 22:08:04 +02:00
const llama_vocab & vocab ;
std : : vector < llm_symbol > symbols ;
llm_bigram_spm : : queue work_queue ;
std : : map < std : : string , std : : pair < int , int > > rev_merge ;
} ;
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
struct llm_bigram_bpe {
struct comparator {
2023-08-29 22:55:03 +02:00
bool operator ( ) ( const llm_bigram_bpe & l , const llm_bigram_bpe & r ) const {
2023-08-23 22:08:04 +02:00
return l . rank > r . rank | | ( l . rank = = r . rank & & l . left > r . left ) ;
}
} ;
using queue_storage = std : : vector < llm_bigram_bpe > ;
using queue = std : : priority_queue < llm_bigram_bpe , queue_storage , comparator > ;
llm_symbol : : index left ;
llm_symbol : : index right ;
std : : string text ;
int rank ;
size_t size ;
} ;
struct llm_tokenizer_bpe {
2023-08-26 13:45:53 +02:00
// Binds the tokenizer to `vocab`.
// NOTE(review): the member declaration is not visible here; presumably a reference
// like in llm_tokenizer_spm, in which case `vocab` must outlive the tokenizer.
llm_tokenizer_bpe(const llama_vocab & vocab)
    : vocab(vocab) {
}
2023-08-23 22:08:04 +02:00
void tokenize ( const std : : string & text , std : : vector < llama_vocab : : id > & output ) {
int final_prev_index = - 1 ;
auto word_collection = bpe_gpt2_preprocess ( text ) ;
symbols_final . clear ( ) ;
for ( auto & word : word_collection ) {
work_queue = llm_bigram_bpe : : queue ( ) ;
symbols . clear ( ) ;
int index = 0 ;
size_t offset = 0 ;
while ( offset < word . size ( ) ) {
llm_symbol sym ;
size_t char_len = std : : min ( word . size ( ) - offset , ( size_t ) : : utf8_len ( word [ offset ] ) ) ;
sym . text = word . c_str ( ) + offset ;
sym . n = 1 ;
sym . n = char_len ;
offset + = sym . n ;
sym . prev = index - 1 ;
sym . next = offset = = word . size ( ) ? - 1 : index + 1 ;
index + + ;
symbols . emplace_back ( sym ) ;
}
for ( size_t i = 1 ; i < symbols . size ( ) ; + + i ) {
add_new_bigram ( i - 1 , i ) ;
}
// build token(s)
while ( ! work_queue . empty ( ) ) {
auto bigram = work_queue . top ( ) ;
work_queue . pop ( ) ;
auto & left_symbol = symbols [ bigram . left ] ;
auto & right_symbol = symbols [ bigram . right ] ;
if ( left_symbol . n = = 0 | | right_symbol . n = = 0 ) {
continue ;
}
std : : string left_token = std : : string ( left_symbol . text , left_symbol . n ) ;
std : : string right_token = std : : string ( right_symbol . text , right_symbol . n ) ;
if ( left_token + right_token ! = bigram . text ) {
continue ; // Skip this bigram if it's outdated
}
// merge the right sym into the left one
left_symbol . n + = right_symbol . n ;
right_symbol . n = 0 ;
// remove the right sym from the chain
left_symbol . next = right_symbol . next ;
if ( right_symbol . next > = 0 ) {
symbols [ right_symbol . next ] . prev = bigram . left ;
}
add_new_bigram ( left_symbol . prev , bigram . left ) ; // left side of current symbol
add_new_bigram ( bigram . left , left_symbol . next ) ; // right side of current symbol
}
// add the fnished tokens to the final list keeping correct order for next and prev
for ( auto & sym : symbols ) {
if ( sym . n > 0 ) {
sym . prev = final_prev_index ;
sym . next = - 1 ;
if ( final_prev_index ! = - 1 ) {
symbols_final [ final_prev_index ] . next = symbols_final . size ( ) ;
}
symbols_final . emplace_back ( sym ) ;
final_prev_index = symbols_final . size ( ) - 1 ;
}
}
}
symbols = symbols_final ;
if ( ! symbols . empty ( ) ) {
for ( int i = 0 ; i ! = - 1 ; i = symbols [ i ] . next ) {
auto & symbol = symbols [ i ] ;
if ( symbol . n = = 0 ) {
continue ;
}
const std : : string str = std : : string ( symbol . text , symbol . n ) ;
const auto token = vocab . token_to_id . find ( str ) ;
if ( token = = vocab . token_to_id . end ( ) ) {
for ( auto j = str . begin ( ) ; j ! = str . end ( ) ; + + j ) {
std : : string byte_str ( 1 , * j ) ;
auto token_multibyte = vocab . token_to_id . find ( byte_str ) ;
if ( token_multibyte = = vocab . token_to_id . end ( ) ) {
2023-10-03 09:16:26 +02:00
throw std : : runtime_error ( " ERROR: byte not found in vocab " ) ;
2023-08-23 22:08:04 +02:00
}
2023-10-03 09:16:26 +02:00
output . push_back ( ( * token_multibyte ) . second ) ;
2023-08-23 22:08:04 +02:00
}
} else {
output . push_back ( ( * token ) . second ) ;
}
}
}
}
private :
void add_new_bigram ( int left , int right ) {
if ( left = = - 1 | | right = = - 1 ) {
return ;
}
std : : string left_token = std : : string ( symbols [ left ] . text , symbols [ left ] . n ) ;
std : : string right_token = std : : string ( symbols [ right ] . text , symbols [ right ] . n ) ;
int rank_found = - 1 ;
rank_found = vocab . find_bpe_rank ( left_token , right_token ) ;
if ( rank_found < 0 ) {
return ;
}
llm_bigram_bpe bigram ;
bigram . left = left ;
bigram . right = right ;
bigram . text = left_token + right_token ;
bigram . size = left_token . size ( ) + right_token . size ( ) ;
bigram . rank = rank_found ;
work_queue . push ( bigram ) ;
}
2023-10-03 09:16:26 +02:00
std : : vector < std : : string > bpe_gpt2_preprocess ( const std : : string & text ) {
std : : vector < std : : string > bpe_words ;
std : : vector < std : : string > bpe_encoded_words ;
std : : string token = " " ;
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
bool collecting_numeric = false ;
bool collecting_letter = false ;
bool collecting_special = false ;
bool collecting_whitespace_lookahead = false ;
bool collecting = false ;
std : : vector < std : : string > text_utf ;
text_utf . reserve ( text . size ( ) ) ;
bpe_words . reserve ( text . size ( ) ) ;
bpe_encoded_words . reserve ( text . size ( ) ) ;
auto cps = codepoints_from_utf8 ( text ) ;
for ( size_t i = 0 ; i < cps . size ( ) ; + + i )
text_utf . emplace_back ( codepoint_to_utf8 ( cps [ i ] ) ) ;
for ( int i = 0 ; i < ( int ) text_utf . size ( ) ; i + + ) {
const std : : string & utf_char = text_utf [ i ] ;
bool split_condition = false ;
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
int bytes_remain = text_utf . size ( ) - i ;
// forward backward lookups
const std : : string & utf_char_next = ( i + 1 < ( int ) text_utf . size ( ) ) ? text_utf [ i + 1 ] : " " ;
const std : : string & utf_char_next_next = ( i + 2 < ( int ) text_utf . size ( ) ) ? text_utf [ i + 2 ] : " " ;
// handling contractions
if ( ! split_condition & & bytes_remain > = 2 ) {
// 's|'t|'m|'d
if ( utf_char = = " \' " & & ( utf_char_next = = " s " | | utf_char_next = = " t " | | utf_char_next = = " m " | | utf_char_next = = " d " ) ) {
split_condition = true ;
}
if ( split_condition ) {
if ( token . size ( ) ) {
bpe_words . emplace_back ( token ) ; // push previous content as token
}
token = utf_char + utf_char_next ;
bpe_words . emplace_back ( token ) ;
token = " " ;
i + + ;
continue ;
}
}
if ( ! split_condition & & bytes_remain > = 3 ) {
// 're|'ve|'ll
if ( utf_char = = " \' " & & (
( utf_char_next = = " r " | | utf_char_next_next = = " e " ) | |
( utf_char_next = = " v " | | utf_char_next_next = = " e " ) | |
( utf_char_next = = " l " | | utf_char_next_next = = " l " ) )
) {
split_condition = true ;
}
if ( split_condition ) {
// current token + next token can be defined
if ( token . size ( ) ) {
bpe_words . emplace_back ( token ) ; // push previous content as token
}
token = utf_char + utf_char_next + utf_char_next_next ;
bpe_words . emplace_back ( token ) ; // the contraction
token = " " ;
i + = 2 ;
continue ;
}
}
if ( ! split_condition & & ! collecting ) {
if ( codepoint_type ( utf_char ) = = CODEPOINT_TYPE_LETTER | | ( ! token . size ( ) & & utf_char = = " " & & codepoint_type ( utf_char_next ) = = CODEPOINT_TYPE_LETTER ) ) {
collecting_letter = true ;
collecting = true ;
}
else if ( codepoint_type ( utf_char ) = = CODEPOINT_TYPE_DIGIT | | ( ! token . size ( ) & & utf_char = = " " & & codepoint_type ( utf_char_next ) = = CODEPOINT_TYPE_DIGIT ) ) {
collecting_numeric = true ;
collecting = true ;
}
else if (
( ( codepoint_type ( utf_char ) ! = CODEPOINT_TYPE_LETTER & & codepoint_type ( utf_char ) ! = CODEPOINT_TYPE_DIGIT ) & & ( codepoint_type ( utf_char ) ! = CODEPOINT_TYPE_WHITESPACE ) ) | |
( ! token . size ( ) & & utf_char = = " " & & codepoint_type ( utf_char_next ) ! = CODEPOINT_TYPE_LETTER & & codepoint_type ( utf_char_next ) ! = CODEPOINT_TYPE_DIGIT & & codepoint_type ( utf_char_next ) ! = CODEPOINT_TYPE_WHITESPACE )
) {
collecting_special = true ;
collecting = true ;
}
else if ( codepoint_type ( utf_char ) = = CODEPOINT_TYPE_WHITESPACE & & codepoint_type ( utf_char_next ) = = CODEPOINT_TYPE_WHITESPACE ) {
collecting_whitespace_lookahead = true ;
collecting = true ;
}
else if ( codepoint_type ( utf_char ) = = CODEPOINT_TYPE_WHITESPACE ) {
split_condition = true ;
}
}
else if ( ! split_condition & & collecting ) {
if ( collecting_letter & & codepoint_type ( utf_char ) ! = CODEPOINT_TYPE_LETTER ) {
split_condition = true ;
}
else if ( collecting_numeric & & codepoint_type ( utf_char ) ! = CODEPOINT_TYPE_DIGIT ) {
split_condition = true ;
}
else if ( collecting_special & & ( codepoint_type ( utf_char ) = = CODEPOINT_TYPE_LETTER | | codepoint_type ( utf_char ) = = CODEPOINT_TYPE_DIGIT | | codepoint_type ( utf_char ) = = CODEPOINT_TYPE_WHITESPACE ) ) {
split_condition = true ;
}
else if ( collecting_whitespace_lookahead & & codepoint_type ( utf_char_next ) ! = CODEPOINT_TYPE_WHITESPACE ) {
split_condition = true ;
}
}
if ( utf_char_next = = " " ) {
split_condition = true ; // final
token + = utf_char ;
}
2023-08-23 22:08:04 +02:00
2023-10-03 09:16:26 +02:00
if ( split_condition ) {
if ( token . size ( ) ) {
bpe_words . emplace_back ( token ) ;
}
token = utf_char ;
collecting = false ;
collecting_letter = false ;
collecting_numeric = false ;
collecting_special = false ;
collecting_whitespace_lookahead = false ;
}
else {
token + = utf_char ;
}
}
2023-08-23 22:08:04 +02:00
2023-10-03 09:16:26 +02:00
for ( std : : string & word : bpe_words ) {
std : : string encoded_token = " " ;
for ( char & c : word ) {
encoded_token + = bytes_to_unicode_bpe ( c ) ;
}
bpe_encoded_words . emplace_back ( encoded_token ) ;
2023-08-23 22:08:04 +02:00
}
2023-08-29 22:55:03 +02:00
2023-10-03 09:16:26 +02:00
return bpe_encoded_words ;
2023-08-23 22:08:04 +02:00
}
const llama_vocab & vocab ;
std : : vector < llm_symbol > symbols ;
std : : vector < llm_symbol > symbols_final ;
llm_bigram_bpe : : queue work_queue ;
2023-03-22 06:32:36 +01:00
} ;
2023-08-27 13:19:19 +02:00
static std : : vector < llama_vocab : : id > llama_tokenize_internal ( const llama_vocab & vocab , std : : string raw_text , bool bos ) {
2023-03-22 06:32:36 +01:00
std : : vector < llama_vocab : : id > output ;
2023-08-27 13:19:19 +02:00
// OG tokenizer behavior:
//
// tokenizer.encode('', add_bos=True) returns [1]
// tokenizer.encode('', add_bos=False) returns []
2023-03-22 06:32:36 +01:00
2023-08-26 13:45:53 +02:00
if ( bos & & vocab . special_bos_id ! = - 1 ) {
output . push_back ( vocab . special_bos_id ) ;
}
2023-08-27 13:19:19 +02:00
if ( raw_text . empty ( ) ) {
return output ;
}
2023-08-23 22:08:04 +02:00
switch ( vocab . type ) {
case LLAMA_VOCAB_TYPE_SPM :
{
2023-08-27 13:19:19 +02:00
// without adding this leading whitespace, we do not get the same results as the original tokenizer
raw_text = " " + raw_text ;
2023-08-23 22:08:04 +02:00
llm_tokenizer_spm tokenizer ( vocab ) ;
2023-08-27 13:19:19 +02:00
llama_escape_whitespace ( raw_text ) ;
tokenizer . tokenize ( raw_text , output ) ;
2023-08-23 22:08:04 +02:00
} break ;
case LLAMA_VOCAB_TYPE_BPE :
{
2023-08-26 13:45:53 +02:00
llm_tokenizer_bpe tokenizer ( vocab ) ;
2023-08-23 22:08:04 +02:00
tokenizer . tokenize ( raw_text , output ) ;
} break ;
2023-09-28 23:41:44 +02:00
}
2023-03-22 06:32:36 +01:00
return output ;
}
2023-07-24 05:58:10 +02:00
//
// grammar - internal
//
2023-08-18 01:54:44 +02:00
// State of a UTF-8 sequence that has been only partially decoded.
struct llama_partial_utf8 {
    // payload bits accumulated so far (unshifted)
    uint32_t value;
    // number of bytes still expected; -1 marks an invalid sequence
    int      n_remain;
};
2023-07-24 05:58:10 +02:00
struct llama_grammar {
const std : : vector < std : : vector < llama_grammar_element > > rules ;
std : : vector < std : : vector < const llama_grammar_element * > > stacks ;
2023-08-18 01:54:44 +02:00
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8 ;
2023-07-24 05:58:10 +02:00
} ;
// One candidate considered by the grammar rejection routines.
// Member order matters: instances are brace-initialized.
struct llama_grammar_candidate {
    // caller-defined index, carried through unchanged
    size_t             index;
    // remaining code points of the candidate, terminated by 0
    const uint32_t   * code_points;
    // trailing incomplete UTF-8 sequence of the candidate, if any
    llama_partial_utf8 partial_utf8;
};
2023-08-18 01:54:44 +02:00
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
2023-09-15 21:38:27 +02:00
static std : : pair < std : : vector < uint32_t > , llama_partial_utf8 > decode_utf8 (
2023-08-18 01:54:44 +02:00
const char * src ,
llama_partial_utf8 partial_start ) {
static const int lookup [ ] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 2 , 2 , 3 , 4 } ;
2023-07-24 05:58:10 +02:00
const char * pos = src ;
std : : vector < uint32_t > code_points ;
2023-08-18 01:54:44 +02:00
uint32_t value = partial_start . value ;
int n_remain = partial_start . n_remain ;
// continue previous decode, if applicable
while ( * pos ! = 0 & & n_remain > 0 ) {
uint8_t next_byte = static_cast < uint8_t > ( * pos ) ;
if ( ( next_byte > > 6 ) ! = 2 ) {
// invalid sequence, abort
code_points . push_back ( 0 ) ;
return std : : make_pair ( std : : move ( code_points ) , llama_partial_utf8 { 0 , - 1 } ) ;
}
value = ( value < < 6 ) + ( next_byte & 0x3F ) ;
+ + pos ;
- - n_remain ;
}
if ( partial_start . n_remain > 0 & & n_remain = = 0 ) {
code_points . push_back ( value ) ;
}
// decode any subsequent utf-8 sequences, which may end in an incomplete one
2023-07-24 05:58:10 +02:00
while ( * pos ! = 0 ) {
uint8_t first_byte = static_cast < uint8_t > ( * pos ) ;
uint8_t highbits = first_byte > > 4 ;
2023-08-18 01:54:44 +02:00
n_remain = lookup [ highbits ] - 1 ;
if ( n_remain < 0 ) {
// invalid sequence, abort
code_points . clear ( ) ;
code_points . push_back ( 0 ) ;
return std : : make_pair ( std : : move ( code_points ) , llama_partial_utf8 { 0 , n_remain } ) ;
}
uint8_t mask = ( 1 < < ( 7 - n_remain ) ) - 1 ;
value = first_byte & mask ;
2023-07-24 05:58:10 +02:00
+ + pos ;
2023-08-18 01:54:44 +02:00
while ( * pos ! = 0 & & n_remain > 0 ) {
2023-07-24 05:58:10 +02:00
value = ( value < < 6 ) + ( static_cast < uint8_t > ( * pos ) & 0x3F ) ;
2023-08-18 01:54:44 +02:00
+ + pos ;
- - n_remain ;
}
if ( n_remain = = 0 ) {
code_points . push_back ( value ) ;
2023-07-24 05:58:10 +02:00
}
}
code_points . push_back ( 0 ) ;
2023-08-18 01:54:44 +02:00
return std : : make_pair ( std : : move ( code_points ) , llama_partial_utf8 { value , n_remain } ) ;
2023-07-24 05:58:10 +02:00
}
// returns true iff pos points to the end of one of the definitions of a rule
static bool llama_grammar_is_end_of_sequence ( const llama_grammar_element * pos ) {
switch ( pos - > type ) {
2023-08-21 22:07:43 +02:00
case LLAMA_GRETYPE_END : return true ; // NOLINT
case LLAMA_GRETYPE_ALT : return true ; // NOLINT
2023-07-24 05:58:10 +02:00
default : return false ;
}
}
// returns true iff chr satisfies the char range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static std : : pair < bool , const llama_grammar_element * > llama_grammar_match_char (
const llama_grammar_element * pos ,
const uint32_t chr ) {
bool found = false ;
bool is_positive_char = pos - > type = = LLAMA_GRETYPE_CHAR ;
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( is_positive_char | | pos - > type = = LLAMA_GRETYPE_CHAR_NOT ) ; // NOLINT
2023-07-24 05:58:10 +02:00
do {
if ( pos [ 1 ] . type = = LLAMA_GRETYPE_CHAR_RNG_UPPER ) {
// inclusive range, e.g. [a-z]
found = found | | ( pos - > value < = chr & & chr < = pos [ 1 ] . value ) ;
pos + = 2 ;
} else {
// exact char match, e.g. [a] or "a"
found = found | | pos - > value = = chr ;
pos + = 1 ;
}
} while ( pos - > type = = LLAMA_GRETYPE_CHAR_ALT ) ;
return std : : make_pair ( found = = is_positive_char , pos ) ;
}
2023-08-18 01:54:44 +02:00
// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
// range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static bool llama_grammar_match_partial_char (
const llama_grammar_element * pos ,
const llama_partial_utf8 partial_utf8 ) {
bool is_positive_char = pos - > type = = LLAMA_GRETYPE_CHAR ;
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( is_positive_char | | pos - > type = = LLAMA_GRETYPE_CHAR_NOT ) ;
2023-08-18 01:54:44 +02:00
uint32_t partial_value = partial_utf8 . value ;
int n_remain = partial_utf8 . n_remain ;
// invalid sequence or 7-bit char split across 2 bytes (overlong)
if ( n_remain < 0 | | ( n_remain = = 1 & & partial_value < 2 ) ) {
return false ;
}
// range of possible code points this partial UTF-8 sequence could complete to
uint32_t low = partial_value < < ( n_remain * 6 ) ;
uint32_t high = low | ( ( 1 < < ( n_remain * 6 ) ) - 1 ) ;
if ( low = = 0 ) {
if ( n_remain = = 2 ) {
low = 1 < < 11 ;
} else if ( n_remain = = 3 ) {
low = 1 < < 16 ;
}
}
do {
if ( pos [ 1 ] . type = = LLAMA_GRETYPE_CHAR_RNG_UPPER ) {
// inclusive range, e.g. [a-z]
if ( pos - > value < = high & & low < = pos [ 1 ] . value ) {
return is_positive_char ;
}
pos + = 2 ;
} else {
// exact char match, e.g. [a] or "a"
if ( low < = pos - > value & & pos - > value < = high ) {
return is_positive_char ;
}
pos + = 1 ;
}
} while ( pos - > type = = LLAMA_GRETYPE_CHAR_ALT ) ;
return ! is_positive_char ;
}
2023-07-24 05:58:10 +02:00
// transforms a grammar pushdown stack into N possible stacks, all ending
// at a character range (terminal element)
static void llama_grammar_advance_stack (
const std : : vector < std : : vector < llama_grammar_element > > & rules ,
const std : : vector < const llama_grammar_element * > & stack ,
std : : vector < std : : vector < const llama_grammar_element * > > & new_stacks ) {
if ( stack . empty ( ) ) {
2023-09-01 15:34:50 +02:00
new_stacks . emplace_back ( stack ) ;
2023-07-24 05:58:10 +02:00
return ;
}
const llama_grammar_element * pos = stack . back ( ) ;
switch ( pos - > type ) {
case LLAMA_GRETYPE_RULE_REF : {
const size_t rule_id = static_cast < size_t > ( pos - > value ) ;
const llama_grammar_element * subpos = rules [ rule_id ] . data ( ) ;
do {
// init new stack without the top (pos)
std : : vector < const llama_grammar_element * > new_stack ( stack . begin ( ) , stack . end ( ) - 1 ) ;
if ( ! llama_grammar_is_end_of_sequence ( pos + 1 ) ) {
// if this rule ref is followed by another element, add that to stack
new_stack . push_back ( pos + 1 ) ;
}
if ( ! llama_grammar_is_end_of_sequence ( subpos ) ) {
// if alternate is nonempty, add to stack
new_stack . push_back ( subpos ) ;
}
llama_grammar_advance_stack ( rules , new_stack , new_stacks ) ;
while ( ! llama_grammar_is_end_of_sequence ( subpos ) ) {
// scan to end of alternate def
subpos + + ;
}
if ( subpos - > type = = LLAMA_GRETYPE_ALT ) {
// there's another alternate def of this rule to process
subpos + + ;
} else {
break ;
}
} while ( true ) ;
break ;
}
case LLAMA_GRETYPE_CHAR :
case LLAMA_GRETYPE_CHAR_NOT :
2023-09-01 15:34:50 +02:00
new_stacks . emplace_back ( stack ) ;
2023-07-24 05:58:10 +02:00
break ;
default :
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( false ) ;
2023-07-24 05:58:10 +02:00
}
}
// takes a set of possible pushdown stacks on a grammar, which are required to
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
static std : : vector < std : : vector < const llama_grammar_element * > > llama_grammar_accept (
const std : : vector < std : : vector < llama_grammar_element > > & rules ,
const std : : vector < std : : vector < const llama_grammar_element * > > & stacks ,
const uint32_t chr ) {
std : : vector < std : : vector < const llama_grammar_element * > > new_stacks ;
for ( const auto & stack : stacks ) {
if ( stack . empty ( ) ) {
continue ;
}
auto match = llama_grammar_match_char ( stack . back ( ) , chr ) ;
if ( match . first ) {
const llama_grammar_element * pos = match . second ;
// update top of stack to next element, if any
std : : vector < const llama_grammar_element * > new_stack ( stack . begin ( ) , stack . end ( ) - 1 ) ;
if ( ! llama_grammar_is_end_of_sequence ( pos ) ) {
new_stack . push_back ( pos ) ;
}
llama_grammar_advance_stack ( rules , new_stack , new_stacks ) ;
}
}
return new_stacks ;
}
static std : : vector < llama_grammar_candidate > llama_grammar_reject_candidates (
const std : : vector < std : : vector < llama_grammar_element > > & rules ,
const std : : vector < std : : vector < const llama_grammar_element * > > & stacks ,
const std : : vector < llama_grammar_candidate > & candidates ) ;
static std : : vector < llama_grammar_candidate > llama_grammar_reject_candidates_for_stack (
const std : : vector < std : : vector < llama_grammar_element > > & rules ,
const std : : vector < const llama_grammar_element * > & stack ,
const std : : vector < llama_grammar_candidate > & candidates ) {
std : : vector < llama_grammar_candidate > rejects ;
if ( stack . empty ( ) ) {
2023-08-18 01:54:44 +02:00
for ( auto tok : candidates ) {
if ( * tok . code_points ! = 0 | | tok . partial_utf8 . n_remain ! = 0 ) {
rejects . push_back ( tok ) ;
}
}
2023-07-24 05:58:10 +02:00
return rejects ;
}
const llama_grammar_element * stack_pos = stack . back ( ) ;
std : : vector < llama_grammar_candidate > next_candidates ;
for ( auto tok : candidates ) {
2023-08-18 01:54:44 +02:00
if ( * tok . code_points = = 0 ) {
// reached end of full codepoints in token, reject iff it ended in a partial sequence
// that cannot satisfy this position in grammar
if ( tok . partial_utf8 . n_remain ! = 0 & &
! llama_grammar_match_partial_char ( stack_pos , tok . partial_utf8 ) ) {
rejects . push_back ( tok ) ;
2023-07-24 05:58:10 +02:00
}
2023-08-18 01:54:44 +02:00
} else if ( llama_grammar_match_char ( stack_pos , * tok . code_points ) . first ) {
next_candidates . push_back ( { tok . index , tok . code_points + 1 , tok . partial_utf8 } ) ;
2023-07-24 05:58:10 +02:00
} else {
rejects . push_back ( tok ) ;
}
}
2023-08-21 22:07:43 +02:00
const auto * stack_pos_after = llama_grammar_match_char ( stack_pos , 0 ) . second ;
2023-07-24 05:58:10 +02:00
// update top of stack to next element, if any
std : : vector < const llama_grammar_element * > stack_after ( stack . begin ( ) , stack . end ( ) - 1 ) ;
if ( ! llama_grammar_is_end_of_sequence ( stack_pos_after ) ) {
stack_after . push_back ( stack_pos_after ) ;
}
std : : vector < std : : vector < const llama_grammar_element * > > next_stacks ;
llama_grammar_advance_stack ( rules , stack_after , next_stacks ) ;
auto next_rejects = llama_grammar_reject_candidates ( rules , next_stacks , next_candidates ) ;
for ( auto tok : next_rejects ) {
2023-08-18 01:54:44 +02:00
rejects . push_back ( { tok . index , tok . code_points - 1 , tok . partial_utf8 } ) ;
2023-07-24 05:58:10 +02:00
}
return rejects ;
}
static std : : vector < llama_grammar_candidate > llama_grammar_reject_candidates (
const std : : vector < std : : vector < llama_grammar_element > > & rules ,
const std : : vector < std : : vector < const llama_grammar_element * > > & stacks ,
const std : : vector < llama_grammar_candidate > & candidates ) {
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ! stacks . empty ( ) ) ; // REVIEW
2023-07-24 05:58:10 +02:00
if ( candidates . empty ( ) ) {
return std : : vector < llama_grammar_candidate > ( ) ;
}
auto rejects = llama_grammar_reject_candidates_for_stack ( rules , stacks . front ( ) , candidates ) ;
for ( size_t i = 1 , size = stacks . size ( ) ; i < size ; + + i ) {
rejects = llama_grammar_reject_candidates_for_stack ( rules , stacks [ i ] , rejects ) ;
}
return rejects ;
}
//
// grammar - external
//
struct llama_grammar * llama_grammar_init (
const llama_grammar_element * * rules ,
size_t n_rules ,
size_t start_rule_index ) {
const llama_grammar_element * pos ;
// copy rule definitions into vectors
std : : vector < std : : vector < llama_grammar_element > > vec_rules ( n_rules ) ;
for ( size_t i = 0 ; i < n_rules ; i + + ) {
for ( pos = rules [ i ] ; pos - > type ! = LLAMA_GRETYPE_END ; pos + + ) {
vec_rules [ i ] . push_back ( * pos ) ;
}
vec_rules [ i ] . push_back ( { LLAMA_GRETYPE_END , 0 } ) ;
}
// loop over alternates of start rule to build initial stacks
std : : vector < std : : vector < const llama_grammar_element * > > stacks ;
pos = rules [ start_rule_index ] ;
do {
std : : vector < const llama_grammar_element * > stack ;
if ( ! llama_grammar_is_end_of_sequence ( pos ) ) {
// if alternate is nonempty, add to stack
stack . push_back ( pos ) ;
}
llama_grammar_advance_stack ( vec_rules , stack , stacks ) ;
while ( ! llama_grammar_is_end_of_sequence ( pos ) ) {
// scan to end of alternate def
pos + + ;
}
if ( pos - > type = = LLAMA_GRETYPE_ALT ) {
// there's another alternate def of this rule to process
pos + + ;
} else {
break ;
}
} while ( true ) ;
2023-08-18 01:54:44 +02:00
return new llama_grammar { std : : move ( vec_rules ) , std : : move ( stacks ) , { } } ;
2023-07-24 05:58:10 +02:00
}
// Destroys a grammar object; `grammar` is released with `delete`.
void llama_grammar_free(struct llama_grammar * grammar) {
    delete grammar;
}
2023-09-05 07:46:17 +02:00
struct llama_grammar * llama_grammar_copy ( const struct llama_grammar * grammar ) {
llama_grammar * result = new llama_grammar { grammar - > rules , grammar - > stacks , grammar - > partial_utf8 } ;
// redirect elements in stacks to point to new rules
for ( size_t is = 0 ; is < result - > stacks . size ( ) ; is + + ) {
for ( size_t ie = 0 ; ie < result - > stacks [ is ] . size ( ) ; ie + + ) {
for ( size_t ir0 = 0 ; ir0 < grammar - > rules . size ( ) ; ir0 + + ) {
for ( size_t ir1 = 0 ; ir1 < grammar - > rules [ ir0 ] . size ( ) ; ir1 + + ) {
if ( grammar - > stacks [ is ] [ ie ] = = & grammar - > rules [ ir0 ] [ ir1 ] ) {
result - > stacks [ is ] [ ie ] = & result - > rules [ ir0 ] [ ir1 ] ;
}
}
}
}
}
return result ;
}
2023-03-22 06:32:36 +01:00
//
// sampling
//
2023-09-28 18:04:36 +02:00
void llama_set_rng_seed ( struct llama_context * ctx , uint32_t seed ) {
if ( seed = = LLAMA_DEFAULT_SEED ) {
seed = time ( NULL ) ;
}
ctx - > rng . seed ( seed ) ;
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
void llama_sample_softmax ( struct llama_context * ctx , llama_token_data_array * candidates ) {
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( candidates - > size > 0 ) ;
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
const int64_t t_start_sample_us = ggml_time_us ( ) ;
2023-03-22 06:32:36 +01:00
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Sort the logits in descending order
if ( ! candidates - > sorted ) {
std : : sort ( candidates - > data , candidates - > data + candidates - > size , [ ] ( const llama_token_data & a , const llama_token_data & b ) {
return a . logit > b . logit ;
} ) ;
candidates - > sorted = true ;
}
float max_l = candidates - > data [ 0 ] . logit ;
float cum_sum = 0.0f ;
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
float p = expf ( candidates - > data [ i ] . logit - max_l ) ;
candidates - > data [ i ] . p = p ;
cum_sum + = p ;
}
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
candidates - > data [ i ] . p / = cum_sum ;
}
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
2023-03-22 06:32:36 +01:00
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
void llama_sample_top_k ( struct llama_context * ctx , llama_token_data_array * candidates , int k , size_t min_keep ) {
const int64_t t_start_sample_us = ggml_time_us ( ) ;
k = std : : max ( k , ( int ) min_keep ) ;
k = std : : min ( k , ( int ) candidates - > size ) ;
// Sort scores in descending order
if ( ! candidates - > sorted ) {
auto comp = [ ] ( const llama_token_data & a , const llama_token_data & b ) {
return a . logit > b . logit ;
} ;
if ( k = = ( int ) candidates - > size ) {
std : : sort ( candidates - > data , candidates - > data + candidates - > size , comp ) ;
} else {
std : : partial_sort ( candidates - > data , candidates - > data + k , candidates - > data + candidates - > size , comp ) ;
2023-04-03 02:19:04 +02:00
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
candidates - > sorted = true ;
2023-04-03 02:19:04 +02:00
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
candidates - > size = k ;
2023-04-03 02:19:04 +02:00
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
2023-03-22 06:32:36 +01:00
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
void llama_sample_top_p ( struct llama_context * ctx , llama_token_data_array * candidates , float p , size_t min_keep ) {
if ( p > = 1.0f ) {
return ;
}
llama_sample_softmax ( ctx , candidates ) ;
2023-07-05 12:31:23 +02:00
const int64_t t_start_sample_us = ggml_time_us ( ) ;
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Compute the cumulative probabilities
float cum_sum = 0.0f ;
size_t last_idx = candidates - > size ;
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
cum_sum + = candidates - > data [ i ] . p ;
2023-06-24 12:15:01 +02:00
// Check if the running sum is at least p or if we have kept at least min_keep tokens
// we set the last index to i+1 to indicate that the current iterate should be included in the set
if ( cum_sum > = p & & i + 1 > = min_keep ) {
last_idx = i + 1 ;
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
break ;
2023-03-22 06:32:36 +01:00
}
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Resize the output vector to keep only the top-p tokens
candidates - > size = last_idx ;
2023-03-22 06:32:36 +01:00
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
2023-03-22 06:32:36 +01:00
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
void llama_sample_tail_free ( struct llama_context * ctx , llama_token_data_array * candidates , float z , size_t min_keep ) {
if ( z > = 1.0f | | candidates - > size < = 2 ) {
return ;
2023-03-22 06:32:36 +01:00
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
llama_sample_softmax ( nullptr , candidates ) ;
2023-07-05 12:31:23 +02:00
const int64_t t_start_sample_us = ggml_time_us ( ) ;
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Compute the first and second derivatives
std : : vector < float > first_derivatives ( candidates - > size - 1 ) ;
std : : vector < float > second_derivatives ( candidates - > size - 2 ) ;
for ( size_t i = 0 ; i < first_derivatives . size ( ) ; + + i ) {
first_derivatives [ i ] = candidates - > data [ i ] . p - candidates - > data [ i + 1 ] . p ;
}
for ( size_t i = 0 ; i < second_derivatives . size ( ) ; + + i ) {
second_derivatives [ i ] = first_derivatives [ i ] - first_derivatives [ i + 1 ] ;
2023-03-22 06:32:36 +01:00
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Calculate absolute value of second derivatives
for ( size_t i = 0 ; i < second_derivatives . size ( ) ; + + i ) {
2023-08-26 18:53:52 +02:00
second_derivatives [ i ] = std : : abs ( second_derivatives [ i ] ) ;
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
}
// Normalize the second derivatives
2023-07-18 13:24:43 +02:00
{
const float second_derivatives_sum = std : : accumulate ( second_derivatives . begin ( ) , second_derivatives . end ( ) , 0.0f ) ;
if ( second_derivatives_sum > 1e-6 f ) {
for ( float & value : second_derivatives ) {
value / = second_derivatives_sum ;
}
} else {
for ( float & value : second_derivatives ) {
value = 1.0f / second_derivatives . size ( ) ;
}
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
}
float cum_sum = 0.0f ;
size_t last_idx = candidates - > size ;
for ( size_t i = 0 ; i < second_derivatives . size ( ) ; + + i ) {
cum_sum + = second_derivatives [ i ] ;
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
if ( cum_sum > z & & i > = min_keep ) {
last_idx = i ;
break ;
2023-03-22 06:32:36 +01:00
}
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Resize the output vector to keep only the tokens above the tail location
candidates - > size = last_idx ;
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
void llama_sample_typical ( struct llama_context * ctx , llama_token_data_array * candidates , float p , size_t min_keep ) {
// Reference implementation:
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
if ( p > = 1.0f ) {
return ;
}
// Compute the softmax of logits and calculate entropy
llama_sample_softmax ( nullptr , candidates ) ;
2023-07-05 12:31:23 +02:00
const int64_t t_start_sample_us = ggml_time_us ( ) ;
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
float entropy = 0.0f ;
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
entropy + = - candidates - > data [ i ] . p * logf ( candidates - > data [ i ] . p ) ;
}
// Compute the absolute difference between negative log probability and entropy for each candidate
std : : vector < float > shifted_scores ;
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
float shifted_score = fabsf ( - logf ( candidates - > data [ i ] . p ) - entropy ) ;
shifted_scores . push_back ( shifted_score ) ;
}
// Sort tokens based on the shifted_scores and their corresponding indices
std : : vector < size_t > indices ( candidates - > size ) ;
std : : iota ( indices . begin ( ) , indices . end ( ) , 0 ) ;
std : : sort ( indices . begin ( ) , indices . end ( ) , [ & ] ( size_t a , size_t b ) {
return shifted_scores [ a ] < shifted_scores [ b ] ;
} ) ;
// Compute the cumulative probabilities
float cum_sum = 0.0f ;
size_t last_idx = indices . size ( ) ;
for ( size_t i = 0 ; i < indices . size ( ) ; + + i ) {
size_t idx = indices [ i ] ;
cum_sum + = candidates - > data [ idx ] . p ;
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
if ( cum_sum > p & & i > = min_keep - 1 ) {
last_idx = i + 1 ;
break ;
}
}
// Resize the output vector to keep only the locally typical tokens
std : : vector < llama_token_data > new_candidates ;
for ( size_t i = 0 ; i < last_idx ; + + i ) {
size_t idx = indices [ i ] ;
new_candidates . push_back ( candidates - > data [ idx ] ) ;
}
// Replace the data in candidates with the new_candidates data
std : : copy ( new_candidates . begin ( ) , new_candidates . end ( ) , candidates - > data ) ;
candidates - > size = new_candidates . size ( ) ;
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
2023-09-28 18:04:36 +02:00
void llama_sample_temp ( struct llama_context * ctx , llama_token_data_array * candidates_p , float temp ) {
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
const int64_t t_start_sample_us = ggml_time_us ( ) ;
for ( size_t i = 0 ; i < candidates_p - > size ; + + i ) {
candidates_p - > data [ i ] . logit / = temp ;
}
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
2023-09-28 18:04:36 +02:00
// Alternate name for llama_sample_temp: forwards all arguments unchanged.
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
    llama_sample_temp(ctx, candidates_p, temp);
}
2023-05-02 22:09:08 +02:00
void llama_sample_repetition_penalty ( struct llama_context * ctx , llama_token_data_array * candidates , const llama_token * last_tokens , size_t last_tokens_size , float penalty ) {
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
if ( last_tokens_size = = 0 | | penalty = = 1.0f ) {
return ;
}
const int64_t t_start_sample_us = ggml_time_us ( ) ;
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
2023-05-13 10:23:15 +02:00
const auto * token_iter = std : : find ( last_tokens , last_tokens + last_tokens_size , candidates - > data [ i ] . id ) ;
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
if ( token_iter = = last_tokens + last_tokens_size ) {
continue ;
}
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if ( candidates - > data [ i ] . logit < = 0 ) {
candidates - > data [ i ] . logit * = penalty ;
} else {
candidates - > data [ i ] . logit / = penalty ;
}
}
candidates - > sorted = false ;
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
2023-05-02 22:09:08 +02:00
void llama_sample_frequency_and_presence_penalties ( struct llama_context * ctx , llama_token_data_array * candidates , const llama_token * last_tokens_p , size_t last_tokens_size , float alpha_frequency , float alpha_presence ) {
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
if ( last_tokens_size = = 0 | | ( alpha_frequency = = 0.0f & & alpha_presence = = 0.0f ) ) {
return ;
}
const int64_t t_start_sample_us = ggml_time_us ( ) ;
// Create a frequency map to count occurrences of each token in last_tokens
std : : unordered_map < llama_token , int > token_count ;
for ( size_t i = 0 ; i < last_tokens_size ; + + i ) {
token_count [ last_tokens_p [ i ] ] + + ;
}
// Apply frequency and presence penalties to the candidates
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
auto token_iter = token_count . find ( candidates - > data [ i ] . id ) ;
if ( token_iter = = token_count . end ( ) ) {
continue ;
}
int count = token_iter - > second ;
candidates - > data [ i ] . logit - = float ( count ) * alpha_frequency + float ( count > 0 ) * alpha_presence ;
}
candidates - > sorted = false ;
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
2023-07-24 05:58:10 +02:00
void llama_sample_grammar ( struct llama_context * ctx , llama_token_data_array * candidates , const struct llama_grammar * grammar ) {
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ctx ) ;
2023-07-24 05:58:10 +02:00
const int64_t t_start_sample_us = ggml_time_us ( ) ;
bool allow_eos = false ;
for ( const auto & stack : grammar - > stacks ) {
if ( stack . empty ( ) ) {
allow_eos = true ;
break ;
}
}
2023-08-21 22:07:43 +02:00
const llama_token eos = llama_token_eos ( ctx ) ;
2023-07-24 05:58:10 +02:00
2023-08-18 01:54:44 +02:00
std : : vector < std : : pair < std : : vector < uint32_t > , llama_partial_utf8 > > candidates_decoded ;
std : : vector < llama_grammar_candidate > candidates_grammar ;
2023-07-24 05:58:10 +02:00
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
2023-08-27 13:19:19 +02:00
const llama_token id = candidates - > data [ i ] . id ;
const std : : string piece = llama_token_to_str ( ctx , id ) ;
2023-07-24 05:58:10 +02:00
if ( id = = eos ) {
if ( ! allow_eos ) {
candidates - > data [ i ] . logit = - INFINITY ;
}
2023-08-27 13:19:19 +02:00
} else if ( piece . empty ( ) | | piece [ 0 ] = = 0 ) {
2023-07-24 05:58:10 +02:00
candidates - > data [ i ] . logit = - INFINITY ;
} else {
2023-08-27 13:19:19 +02:00
candidates_decoded . push_back ( decode_utf8 ( piece . c_str ( ) , grammar - > partial_utf8 ) ) ;
2023-08-21 22:07:43 +02:00
candidates_grammar . push_back ( { i , candidates_decoded . back ( ) . first . data ( ) , candidates_decoded . back ( ) . second } ) ;
2023-07-24 05:58:10 +02:00
}
}
2023-08-21 22:07:43 +02:00
const auto rejects = llama_grammar_reject_candidates ( grammar - > rules , grammar - > stacks , candidates_grammar ) ;
for ( const auto & reject : rejects ) {
2023-07-24 05:58:10 +02:00
candidates - > data [ reject . index ] . logit = - INFINITY ;
}
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
2023-07-11 18:18:43 +02:00
// Replaces array[0 .. size) in place with its log-softmax:
//   array[i] <- array[i] - log(sum_j exp(array[j]))
//
// The maximum is subtracted before exponentiating so the sum cannot overflow,
// and the result is formed as (x - max) - log(sum) rather than
// log(exp(x - max) / sum): the latter collapses to -inf whenever exp(x - max)
// underflows, even though the true log-probability is finite.
//
// An empty array is left untouched.
static void llama_log_softmax(float * array, size_t size) {
    if (size == 0) {
        // std::max_element would return the end iterator; dereferencing it is undefined.
        return;
    }

    const float max_l = *std::max_element(array, array + size);

    float sum = 0.f;
    for (size_t i = 0; i < size; ++i) {
        sum += expf(array[i] - max_l);
    }

    const float log_sum = logf(sum);
    for (size_t i = 0; i < size; ++i) {
        array[i] = (array[i] - max_l) - log_sum;
    }
}
void llama_sample_classifier_free_guidance (
struct llama_context * ctx ,
llama_token_data_array * candidates ,
struct llama_context * guidance_ctx ,
2023-07-21 12:58:36 +02:00
float scale ) {
2023-07-16 23:01:45 +02:00
int64_t t_start_sample_us = ggml_time_us ( ) ;
2023-07-11 18:18:43 +02:00
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ctx ) ;
2023-09-28 21:42:38 +02:00
auto n_vocab = llama_n_vocab ( llama_get_model ( ctx ) ) ;
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( n_vocab = = ( int ) candidates - > size ) ;
GGML_ASSERT ( ! candidates - > sorted ) ;
2023-07-11 18:18:43 +02:00
std : : vector < float > logits_base ;
logits_base . reserve ( candidates - > size ) ;
for ( size_t i = 0 ; i < candidates - > size ; + + i ) {
logits_base . push_back ( candidates - > data [ i ] . logit ) ;
}
llama_log_softmax ( logits_base . data ( ) , candidates - > size ) ;
float * logits_guidance = llama_get_logits ( guidance_ctx ) ;
llama_log_softmax ( logits_guidance , n_vocab ) ;
for ( int i = 0 ; i < n_vocab ; + + i ) {
float logit_guidance = logits_guidance [ i ] ;
float logit_base = logits_base [ i ] ;
2023-07-21 12:58:36 +02:00
candidates - > data [ i ] . logit = scale * ( logit_base - logit_guidance ) + logit_guidance ;
2023-07-11 18:18:43 +02:00
}
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Mirostat (v1) token sampling.
//
// Estimates s_hat from the ratios of the leading candidate probabilities,
// derives a top-k cut-off k from s_hat and the current *mu, samples a token
// from the truncated candidates, then moves *mu according to the difference
// between the observed surprise of that token and the target tau.
//
//   ctx        - required (asserted); supplies the vocabulary size, is passed to
//                llama_sample_token, and receives timing
//   candidates - candidate list; softmax-ed and truncated (top-k) in place
//   tau        - target surprise value
//   eta        - learning rate used when updating *mu
//   m          - number of most probable tokens used to estimate s_hat
//   mu         - in/out: read to compute k, then updated as *mu - eta * (observed - tau)
//
// Returns the sampled token.
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
    GGML_ASSERT(ctx);

    auto N = float(llama_n_vocab(llama_get_model(ctx)));

    int64_t t_start_sample_us;
    t_start_sample_us = ggml_time_us();

    llama_sample_softmax(nullptr, candidates);

    // Estimate s_hat using the most probable m tokens
    float numerator   = 0.0;
    float denominator = 0.0;
    const size_t n_pairs = std::min(size_t(m - 1), candidates->size - 1);
    for (size_t i = 0; i < n_pairs; ++i) {
        const float t_i = logf(float(i + 2) / float(i + 1));
        const float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
        numerator   += t_i * b_i;
        denominator += t_i * t_i;
    }
    const float s_hat = numerator / denominator;

    // Compute k from the estimated s_hat and target surprise value
    const float epsilon_hat = s_hat - 1;
    const float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

    // Sample the next word X using top-k sampling
    llama_sample_top_k(nullptr, candidates, int(k), 1);
    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
    // Called outside the timed region above; the timer is restarted afterwards.
    const llama_token X = llama_sample_token(ctx, candidates);
    t_start_sample_us = ggml_time_us();

    // Position of X within the (truncated) candidates
    size_t X_idx = 0;
    while (X_idx < candidates->size && candidates->data[X_idx].id != X) {
        ++X_idx;
    }

    // Compute error as the difference between observed surprise and target surprise value
    const float observed_surprise = -log2f(candidates->data[X_idx].p);
    const float e = observed_surprise - tau;

    // Update mu using the learning rate and error
    *mu = *mu - eta * e;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
    return X;
}
llama_token llama_sample_token_mirostat_v2 ( struct llama_context * ctx , llama_token_data_array * candidates , float tau , float eta , float * mu ) {
int64_t t_start_sample_us ;
t_start_sample_us = ggml_time_us ( ) ;
llama_sample_softmax ( ctx , candidates ) ;
// Truncate the words with surprise values greater than mu
candidates - > size = std : : distance ( candidates - > data , std : : find_if ( candidates - > data , candidates - > data + candidates - > size , [ & ] ( const llama_token_data & candidate ) {
return - log2f ( candidate . p ) > * mu ;
} ) ) ;
train : improved training-from-scratch example (#1652)
* add python wrapper
https://gist.github.com/abetlen/2b90e5f153f6efd00931d098de5c73ce
* fix decoding error. adds errors=ignore parameter
* add python bindings for functions to get and set the whole llama state
(rng, logits, embedding and kv_cache)
* update python bindings
* add text generating baby-llama from scratch example
* fix race condition bug in ggml_compute_forward_diag_mask_f32
* implement ggml_soft_max_back for more performant backward pass of soft_max
avoids creating big intermediate matrices of size n_embd x n_embd for llama layers and n_vocab x n_vocab for cross entropy loss
* improve softmax backward pass
go from quadratic runtime to linear runtime by simplifying the formulas
* fix race condition bug in non-inplace ggml_compute_forward_diag_mask_f32
memcpy needs to be synchronized across threads to avoid race conditions.
=> do it in INIT phase
* fix bug in ggml_compute_forward_soft_max_back_f32 on DEBUG build
* improve performance of mul_mat backward pass
avoid transpose by using mul_mat with swapped arguments
* avoid printing too much newlines in baby-llama-text
* activate threading in baby-llama-text
* add ggml_out_prod and use it for mul_mat backward pass for improved performance
performance stats report improvement from 37 seconds to 16 seconds runtime during my training tests
* better weight initialization improves training convergence at start
* better weight initialization improves training convergence at start
* improve ggml_out_prod performance
- change iteration order (>15s -> 10s runtime)
- parallelize over one more dimension: over dst matrix rows (10s -> <5s runtime)
* add llama sampler, shuffle samples and constrain sampling to tokens occurring in train data
* fix get_samples call, add model tensor names, increase model size, start training samples after newline
* save train trained model to checkpoint and load model to be trained from checkpoint
* use inplace functions where possible
* initialize rng with srand
* use different arguments for input and output checkpoint
* ggml fixes to support backward pass on inplace operations
* remove duplicate include
* fix cross entropy loss
- add target probabilities for each sample which is then used in cross entropy loss
* print used memory before and after optimization
* sample with non-greedy sampling parameters at the end of training
* add cmake target for baby-llama-text
* add ggml_add1_inplace to header
* enable gradient propagation for inplace add1 and scale operations
those functions backward passes don't need the original src0, so they also work when forward is inplace
* implement AdamW in ggml_opt_adam by adding weight decay parameter (default 0.001f)
also add a schedule parameter (default 1.0f) that can be used to scale alpha and decay according to learning schedule.
setting the decay parameter to zero disables AdamW resulting in normal Adam optimizer.
since the difference between Adam and AdamW is minimal it is not implemented as another optimizer, but integrated into the existing Adam optimizer.
* use inplace operations in cross_entropy_loss
* fix random weight initialization scale
* add missing default parameters for adam optimizer
* add ggml_opt_context, so that we can properly resume training
otherwise the optimizer states, tracking statistics about the error function and its derivates,
will reset to zero each time ggml_opt is called, hindering convergence on resumed training.
now the optimizer context and all its memory is stored in a separate struct.
* fix bug in llama_sample_token_mirostat_v2
when all candidates are filtered out through mu threshold, the following soft_max operation will fail.
so keep at least one.
* add forward function without using cache, for more performant training
during training on whole samples no cache is required.
removing the cache and simplifying the remaining code results in performance and memory usage improvement.
* print suppressed newline tokens as string "\n"
printing too much actual newlines is suppressed to avoid flooding the console.
* store optimizer state in training checkpoint and add learning schedule
persistent optimizer state allows to resume training without resetting the optimizer
learning schedule consists of linear warmup ramp followed by cosine decay with restarts
* remove unused functions
* fix bug in get_samples which corrupted training targets
* save checkpoint only when it was trained
* simplify code
* remove trailing whitespace
* simplify backward pass for SQRT
* replace inefficient repeat backward pass with dedicated repeat_back operation
* add ggml_cross_entropy_loss with backward pass for faster training
cross entropy loss can also be implemented using softmax and log, but as dedicated operation it is faster and especially avoids unnecessary memory overhead.
* add tests for cross_entropy_loss backward pass
finite differences regularly results in estimated gradient of zero, despite the backward pass giving non zero gradient.
_probably_ the finite differences fails due to numerical issues
* use ggml_cross_entropy_loss in text training example
* remove trailing whitespace
* slightly improve how cross entropy loss is compute
btw: directly implemented cross entropy loss seems to have way lower magnitudes than when implemented with softmax and log.
probably the input to log gets closer to zero due to float numerics.
maybe the multiplication by (1.0-eps)/sum is more accurate..
* add llama_get_vocab to get the vocabulary as output parameters
* set default model.type for unknown models with few layers
* add export of training checkpoint to llama compatible model file
* get vocabulary for exporting training checkpoint to llama compatible model file
* implement backward pass of flash attention
* bugfixes for backward pass of flash attention
* test flash attention backward pass
need to set loose error bounds to pass.
the finitie differences are close to numeric limits and often return quite different values than the backward pass.
reducing eps further lets the gradients vanish completely.
likewise setting eps to big results in wronger values.
the softmax in the middle of the function is probably the most responsible for the numeric issues using finite differences.
* add option to train with flash attention and move options to the top of the main function
training from scratch also works with flash attention
training convergence and generation results after fix number of iterations are worse than when not using flash attention.
maybe there still lingers a bug in the flash attention backward pass?
but training works, just with slower convergence.
flash attention is still worth to use, because it requires way less memory and is faster with high n_ctx
* add train_params and command line option parser
* remove unnecessary comments
* add train params to specify memory size
* remove python bindings
* rename baby-llama-text to train-text-from-scratch
* replace auto parameters in lambda function
* add #include <climits>
* add explicit cast to fix compile error
"error: non-constant-expression cannot be narrowed from type 'int64_t' (aka 'long long') to 'uint32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing]"
* remove trailing whitespace
* add ggml_opt_resume_g which accepts forward and backward cgraphs
* fix formulas in comments
* bug fix for ggml_compute_forward_get_rows_back_f32
the result should be set to zero, not to whatever data is in opt0
* improve training memory usage with scratch buffers
instead of relying on the automatic backward pass, we manually create the graph for the backward pass.
it turns out that all backward pass operations need only temporary memory which can be reused after each layer.
will compute backward pass for ALL model parameters
* add option to use scratch buffers in training or not
make it configurable because currently training with scratch buffers implies flash attention and optimization over all parameters.
* ci : disable temporary
* store view offset and permute axes in opt[0] instead of storing it in padding
use memcpy to store offset, because offset is of type size_t.
when storing it as int32_t offset would have to be smaller than 2^31 which is not necessarily true.
* minor : fix compile warnings + minor style changes
* fix bug in threaded indices calculation of ggml_compute_forward_flash_attn_back_f32
* store view offset like in master branch
* bug fix in forward_batch_wo_cache_flash_attn_train
* scratch buffer bug fixes in forward_batch_wo_cache_flash_attn_train
data of permute and reshape is the same as their input.
if we want to preserve the output of permute/reshape, we also need to preserve their inputs.
replace reshape(src0, src1) with reshape_nd calls so that we don't need src1.
replace (temporary) t03 with ggml_repeat(ctx0, layer.attention_norm, t02).
in the future we could also use the new broadcasting ggml_mul to avoid these repeat calls.
for this we need backward pass of broadcasting ggml_mul.
* remove unnecessary scratch buffer 0
buf 0 is persistent memory, so we can just disable scratch for this by using buf -1
* avoid creating unnecessary grad tensors
previously we needed to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads
this wasted memory, because unnecessary grad for each op were automatically created:
the automatically generated grad was unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ).
this discarded the automatically generated grad resulting in wasted memory.
improved this by changing expand(..) to not use ggml_build_forward_expand.
expand set cgraph->nodes but not the leafs.
cgraph->leafs & cgraph->grads are set in another pass after the last expand call.
* print used training seed
* zero initialize gfbuf and gbbuf
* ci : re-enable workflows + add README for training
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-13 21:04:40 +02:00
if ( candidates - > size = = 0 ) {
candidates - > size = 1 ;
}
2023-07-05 12:31:23 +02:00
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
llama : new sampling algorithms (#1126)
* Sample interface, new samplers.
New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat
Ignore EOS fix: -inf should be used.
* mirostat
* Added --logit-bias and --no-penalize-nl, removed std::span
* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)
* Save and load example adjust
* Tests
* Windows build fix
* Windows test fix
2023-04-29 07:34:41 +02:00
// Normalize the probabilities of the remaining words
llama_sample_softmax ( ctx , candidates ) ;
// Sample the next word X from the remaining words
llama_token X = llama_sample_token ( ctx , candidates ) ;
t_start_sample_us = ggml_time_us ( ) ;
// Compute error as the difference between observed surprise and target surprise value
size_t X_idx = std : : distance ( candidates - > data , std : : find_if ( candidates - > data , candidates - > data + candidates - > size , [ & ] ( const llama_token_data & candidate ) {
return candidate . id = = X ;
} ) ) ;
float observed_surprise = - log2f ( candidates - > data [ X_idx ] . p ) ;
float e = observed_surprise - tau ;
// Update mu using the learning rate and error
* mu = * mu - eta * e ;
if ( ctx ) {
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
return X ;
}
// Greedy sampling: returns the id of the candidate with the largest logit.
// If ctx is non-null, the elapsed time is added to ctx->t_sample_us and
// ctx->n_sample is incremented; a null ctx simply skips the bookkeeping.
llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
    const int64_t t_start_sample_us = ggml_time_us();

    // Strict ordering by logit, used to locate the maximum element
    const auto by_logit = [](const llama_token_data & lhs, const llama_token_data & rhs) {
        return lhs.logit < rhs.logit;
    };

    auto * const first = candidates->data;
    auto * const last  = first + candidates->size;

    const llama_token result = std::max_element(first, last, by_logit)->id;

    if (ctx != nullptr) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
        ctx->n_sample++;
    }

    return result;
}
// Randomly draws one token from `candidates`, weighted by the probabilities
// produced by llama_sample_softmax, using the generator stored in ctx->rng.
// ctx must be non-null (asserted). Updates ctx->t_sample_us and ctx->n_sample.
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
    GGML_ASSERT(ctx);

    const int64_t t_start_sample_us = ggml_time_us();

    // Note: nullptr (not ctx) is passed as the context of the softmax call
    llama_sample_softmax(nullptr, candidates);

    // Gather the per-candidate probabilities as weights for the distribution
    const size_t n_candidates = candidates->size;
    std::vector<float> probs(n_candidates);
    for (size_t i = 0; i < n_candidates; ++i) {
        probs[i] = candidates->data[i].p;
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    auto & rng = ctx->rng;
    const int idx = dist(rng);

    const llama_token result = candidates->data[idx].id;

    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    ctx->n_sample++;

    return result;
}
2023-07-24 05:58:10 +02:00
void llama_grammar_accept_token ( struct llama_context * ctx , struct llama_grammar * grammar , llama_token token ) {
const int64_t t_start_sample_us = ggml_time_us ( ) ;
2023-08-21 22:07:43 +02:00
if ( token = = llama_token_eos ( ctx ) ) {
2023-07-24 05:58:10 +02:00
for ( const auto & stack : grammar - > stacks ) {
if ( stack . empty ( ) ) {
return ;
}
}
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( false ) ;
2023-07-24 05:58:10 +02:00
}
2023-08-27 13:19:19 +02:00
const std : : string piece = llama_token_to_str ( ctx , token ) ;
2023-08-18 01:54:44 +02:00
2023-07-24 05:58:10 +02:00
// Note terminating 0 in decoded string
2023-08-27 13:19:19 +02:00
const auto decoded = decode_utf8 ( piece . c_str ( ) , grammar - > partial_utf8 ) ;
2023-08-18 01:54:44 +02:00
const auto & code_points = decoded . first ;
2023-07-24 05:58:10 +02:00
for ( auto it = code_points . begin ( ) , end = code_points . end ( ) - 1 ; it ! = end ; + + it ) {
grammar - > stacks = llama_grammar_accept ( grammar - > rules , grammar - > stacks , * it ) ;
}
2023-08-18 01:54:44 +02:00
grammar - > partial_utf8 = decoded . second ;
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ! grammar - > stacks . empty ( ) ) ;
2023-07-24 05:58:10 +02:00
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
}
2023-08-25 17:18:48 +02:00
//
// Beam search
//
struct llama_beam {
std : : vector < llama_token > tokens ;
float p ; // Cumulative beam probability (renormalized relative to all beams)
bool eob ; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator < ( const llama_beam & rhs ) const {
return std : : make_pair ( p , eob ) < std : : make_pair ( rhs . p , rhs . eob ) ;
}
// Shift off first n tokens and discard them.
void shift_tokens ( const size_t n ) {
if ( n ) {
std : : copy ( tokens . begin ( ) + n , tokens . end ( ) , tokens . begin ( ) ) ;
tokens . resize ( tokens . size ( ) - n ) ;
}
}
llama_beam_view view ( ) const { return { tokens . data ( ) , tokens . size ( ) , p , eob } ; }
} ;
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits ;
const int n_vocab ;
const float max_l ;
const float normalizer ;
struct sum_exp {
float max_l ;
float operator ( ) ( float sum , float l ) const { return sum + std : : exp ( l - max_l ) ; }
} ;
llama_logit_info ( llama_context * ctx )
: logits ( llama_get_logits ( ctx ) )
2023-09-28 21:42:38 +02:00
, n_vocab ( llama_n_vocab ( llama_get_model ( ctx ) ) )
2023-08-25 17:18:48 +02:00
, max_l ( * std : : max_element ( logits , logits + n_vocab ) )
, normalizer ( 1.0f / std : : accumulate ( logits , logits + n_vocab , 0.0f , sum_exp { max_l } ) )
{ }
llama_token_data get_token_data ( const llama_token token_id ) const {
constexpr auto p = std : : numeric_limits < float > : : quiet_NaN ( ) ; // never used
return { token_id , logits [ token_id ] , p } ;
}
// Return top k token_data by logit.
std : : vector < llama_token_data > top_k ( size_t k ) {
std : : vector < llama_token_data > min_heap ; // min-heap by logit
const llama_token k_min = std : : min ( static_cast < llama_token > ( k ) , n_vocab ) ;
min_heap . reserve ( k_min ) ;
for ( llama_token token_id = 0 ; token_id < k_min ; + + token_id ) {
min_heap . push_back ( get_token_data ( token_id ) ) ;
}
auto comp = [ ] ( const llama_token_data & a , const llama_token_data & b ) { return a . logit > b . logit ; } ;
std : : make_heap ( min_heap . begin ( ) , min_heap . end ( ) , comp ) ;
for ( llama_token token_id = k_min ; token_id < n_vocab ; + + token_id ) {
if ( min_heap . front ( ) . logit < logits [ token_id ] ) {
std : : pop_heap ( min_heap . begin ( ) , min_heap . end ( ) , comp ) ;
min_heap . back ( ) . id = token_id ;
min_heap . back ( ) . logit = logits [ token_id ] ;
std : : push_heap ( min_heap . begin ( ) , min_heap . end ( ) , comp ) ;
}
}
return min_heap ;
}
2023-09-01 15:47:27 +02:00
float probability_from_logit ( float logit ) const {
2023-08-25 17:18:48 +02:00
return normalizer * std : : exp ( logit - max_l ) ;
}
} ;
// State and helper routines for one beam search run.
// `beams` holds the current generation of beams; `next_beams` is the vector in which
// the next generation is assembled (as a min-heap on p) before the two are swapped.
struct llama_beam_search_data {
llama_context * ctx ;
size_t n_beams ;
int n_past ;
int n_predict ;
std : : vector < llama_beam > beams ;
std : : vector < llama_beam > next_beams ;
// Re-calculated on each loop iteration
size_t common_prefix_length ;
// Used to communicate to/from callback on beams state.
std : : vector < llama_beam_view > beam_views ;
2023-09-28 21:42:38 +02:00
// Stores the parameters, sizes beam_views to n_beams entries and reserves
// capacity for n_beams beams in both beam vectors.
llama_beam_search_data ( llama_context * ctx , size_t n_beams , int n_past , int n_predict )
2023-08-25 17:18:48 +02:00
: ctx ( ctx )
, n_beams ( n_beams )
, n_past ( n_past )
, n_predict ( n_predict )
, beam_views ( n_beams ) {
beams . reserve ( n_beams ) ;
next_beams . reserve ( n_beams ) ;
}
// Collapse beams to a single beam given by index.
// The chosen beam is swapped into slot 0 and all other beams are discarded.
void collapse_beams ( const size_t beam_idx ) {
if ( 0u < beam_idx ) {
std : : swap ( beams [ 0 ] , beams [ beam_idx ] ) ;
}
beams . resize ( 1 ) ;
}
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
// The repetitive patterns below reflect the 2 stages of heaps:
// * Gather elements until the vector is full, then call std::make_heap() on it.
// * If the heap is full and a new element is found that should be included, pop the
// least element to the back(), replace it with the new, then push it into the heap.
//
// Offers `beam` (if at eob) or its extensions by the top n_beams next tokens (otherwise)
// to next_beams. Note: `beam` may be moved from in the eob case.
void fill_next_beams_by_top_probabilities ( llama_beam & beam ) {
// Min-heaps use a greater-than comparator.
const auto comp = [ ] ( const llama_beam & a , const llama_beam & b ) { return a . p > b . p ; } ;
if ( beam . eob ) {
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
if ( next_beams . size ( ) < n_beams ) {
next_beams . push_back ( std : : move ( beam ) ) ;
// Heapify only once the vector has reached its full size of n_beams.
if ( next_beams . size ( ) = = n_beams ) {
std : : make_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
}
} else if ( next_beams . front ( ) . p < beam . p ) {
std : : pop_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
next_beams . back ( ) = std : : move ( beam ) ;
std : : push_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
}
} else {
// beam is not at end-of-sentence, so branch with next top_k tokens.
// Evaluate the beam's pending tokens first (skipped when it has none).
if ( ! beam . tokens . empty ( ) ) {
2023-09-28 21:42:38 +02:00
llama_decode ( ctx , llama_batch_get_one ( beam . tokens . data ( ) , beam . tokens . size ( ) , n_past , 0 ) ) ;
2023-08-25 17:18:48 +02:00
}
llama_logit_info logit_info ( ctx ) ;
std : : vector < llama_token_data > next_tokens = logit_info . top_k ( n_beams ) ;
// i indexes next_tokens and is shared by the loops below, so each candidate token is used once.
size_t i = 0 ;
if ( next_beams . size ( ) < n_beams ) {
// Stage 1: next_beams is not full yet; append extended copies of beam until it is, then heapify.
for ( ; next_beams . size ( ) < n_beams ; + + i ) {
llama_beam next_beam = beam ;
next_beam . tokens . push_back ( next_tokens [ i ] . id ) ;
next_beam . p * = logit_info . probability_from_logit ( next_tokens [ i ] . logit ) ;
next_beams . push_back ( std : : move ( next_beam ) ) ;
}
std : : make_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
} else {
// next_beams is full: first overwrite entries whose p is 0.0f
// (loop() zeroes the probabilities of next_beams before each round).
for ( ; next_beams . front ( ) . p = = 0.0f ; + + i ) {
std : : pop_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
next_beams . back ( ) = beam ;
next_beams . back ( ) . tokens . push_back ( next_tokens [ i ] . id ) ;
next_beams . back ( ) . p * = logit_info . probability_from_logit ( next_tokens [ i ] . logit ) ;
std : : push_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
}
}
// Stage 2: each remaining candidate replaces the current minimum only if it is more probable.
for ( ; i < n_beams ; + + i ) {
const float next_p = beam . p * logit_info . probability_from_logit ( next_tokens [ i ] . logit ) ;
if ( next_beams . front ( ) . p < next_p ) {
std : : pop_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
next_beams . back ( ) = beam ;
next_beams . back ( ) . tokens . push_back ( next_tokens [ i ] . id ) ;
next_beams . back ( ) . p = next_p ;
std : : push_heap ( next_beams . begin ( ) , next_beams . end ( ) , comp ) ;
}
}
}
}
// Find common_prefix_length based on beams.
// Requires beams is not empty.
// Returns the number of leading tokens shared by all beams.
// Note: the local variable below shadows the member of the same name; the member is not modified here.
size_t find_common_prefix_length ( ) {
size_t common_prefix_length = beams [ 0 ] . tokens . size ( ) ;
for ( size_t i = 1 ; i < beams . size ( ) ; + + i ) {
common_prefix_length = std : : min ( common_prefix_length , beams [ i ] . tokens . size ( ) ) ;
for ( size_t j = 0 ; j < common_prefix_length ; + + j ) {
if ( beams [ 0 ] . tokens [ j ] ! = beams [ i ] . tokens [ j ] ) {
common_prefix_length = j ;
break ;
}
}
}
return common_prefix_length ;
}
// Construct beams_state to send back to caller via the callback function.
// Side effect: set common_prefix_length = find_common_prefix_length();
// Also refreshes beam_views[i] from beams[i] for every current beam.
llama_beams_state get_beams_state ( const bool last_call ) {
for ( size_t i = 0 ; i < beams . size ( ) ; + + i ) {
beam_views [ i ] = beams [ i ] . view ( ) ;
}
common_prefix_length = find_common_prefix_length ( ) ;
return { beam_views . data ( ) , beams . size ( ) , common_prefix_length , last_call } ;
}
// Loop:
// * while i < n_predict, AND
// * any of the beams have not yet reached end-of-beam (eob), AND
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
// (since all other beam probabilities can only decrease)
// After the loop, the beams are collapsed to the top beam and the callback is invoked
// one final time with last_call = true.
void loop ( const llama_beam_search_callback_fn_t callback , void * const callback_data ) {
beams . push_back ( { { } , 1.0f , false } ) ; // Start with one empty beam w/ probability = 1.0 and !eob.
const auto not_eob = [ ] ( const llama_beam & beam ) { return ! beam . eob ; } ;
for ( int i = 0 ; i < n_predict & & std : : any_of ( beams . begin ( ) , beams . end ( ) , not_eob ) & &
! beams [ top_beam_index ( ) ] . eob ; + + i ) {
callback ( callback_data , get_beams_state ( false ) ) ; // Sets common_prefix_length
update_beams_from_beam_views ( ) ; // Update values (p,eob) that callback may have changed.
// Evaluate the tokens shared by all beams once and advance n_past past them.
if ( common_prefix_length ) {
2023-09-28 21:42:38 +02:00
llama_decode ( ctx , llama_batch_get_one ( beams [ 0 ] . tokens . data ( ) , common_prefix_length , n_past , 0 ) ) ;
2023-08-25 17:18:48 +02:00
n_past + = common_prefix_length ;
}
// Zero-out next_beam probabilities to place them last in following min-heap.
std : : for_each ( next_beams . begin ( ) , next_beams . end ( ) , [ ] ( llama_beam & beam ) { beam . p = 0.0f ; } ) ;
for ( llama_beam & beam : beams ) {
beam . shift_tokens ( common_prefix_length ) ;
fill_next_beams_by_top_probabilities ( beam ) ;
}
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
beams . swap ( next_beams ) ;
renormalize_beam_probabilities ( beams ) ;
}
collapse_beams ( top_beam_index ( ) ) ;
callback ( callback_data , get_beams_state ( true ) ) ;
}
// As beams grow, the cumulative probabilities decrease.
// Renormalize them to avoid floating point underflow.
// Each p is multiplied by 1 / (sum of all p); no check is made for a zero sum.
static void renormalize_beam_probabilities ( std : : vector < llama_beam > & beams ) {
const auto sum_p = [ ] ( float sum , llama_beam & beam ) { return sum + beam . p ; } ;
const float inv_sum = 1.0f / std : : accumulate ( beams . begin ( ) , beams . end ( ) , 0.0f , sum_p ) ;
std : : for_each ( beams . begin ( ) , beams . end ( ) , [ = ] ( llama_beam & beam ) { beam . p * = inv_sum ; } ) ;
}
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
// Returns the index of the greatest beam under that ordering.
size_t top_beam_index ( ) {
return std : : max_element ( beams . begin ( ) , beams . end ( ) ) - beams . begin ( ) ;
}
// Copy (p,eob) for each beam which may have been changed by the callback.
void update_beams_from_beam_views ( ) {
for ( size_t i = 0 ; i < beams . size ( ) ; + + i ) {
beams [ i ] . p = beam_views [ i ] . p ;
beams [ i ] . eob = beam_views [ i ] . eob ;
}
}
} ;
void llama_beam_search ( llama_context * ctx ,
llama_beam_search_callback_fn_t callback , void * callback_data ,
2023-09-28 21:42:38 +02:00
size_t n_beams , int n_past , int n_predict ) {
2023-08-25 17:18:48 +02:00
assert ( ctx ) ;
const int64_t t_start_sample_us = ggml_time_us ( ) ;
2023-09-28 21:42:38 +02:00
llama_beam_search_data beam_search_data ( ctx , n_beams , n_past , n_predict ) ;
2023-08-25 17:18:48 +02:00
beam_search_data . loop ( callback , callback_data ) ;
ctx - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
ctx - > n_sample + + ;
}
2023-03-22 06:32:36 +01:00
//
// quantization
//
2023-09-15 03:09:53 +02:00
// Wrapper around a single T whose default constructor is user-provided and empty,
// so `value` is left uninitialized on default construction
// (e.g. when a std::vector<no_init<T>> is resized).
template <typename T>
struct no_init {
    T value;

    // Intentionally empty: must not be defaulted, otherwise value-initialization would zero `value`.
    no_init() {}
};
static void llama_convert_tensor_internal (
struct ggml_tensor * tensor , std : : vector < no_init < float > > & output , std : : vector < std : : thread > & workers ,
const size_t nelements , const int nthread
) {
2023-08-21 22:07:43 +02:00
if ( output . size ( ) < nelements ) {
output . resize ( nelements ) ;
2023-06-10 09:59:17 +02:00
}
2023-08-21 22:07:43 +02:00
float * f32_output = ( float * ) output . data ( ) ;
2023-06-10 09:59:17 +02:00
2023-07-05 18:13:06 +02:00
ggml_type_traits_t qtype ;
2023-08-21 22:07:43 +02:00
if ( ggml_is_quantized ( tensor - > type ) ) {
qtype = ggml_internal_get_type_traits ( tensor - > type ) ;
2023-07-05 18:13:06 +02:00
if ( qtype . to_float = = NULL ) {
2023-08-21 22:07:43 +02:00
throw std : : runtime_error ( format ( " type %s unsupported for integer quantization: no dequantization available " , ggml_type_name ( tensor - > type ) ) ) ;
2023-06-10 09:59:17 +02:00
}
2023-08-21 22:07:43 +02:00
} else if ( tensor - > type ! = GGML_TYPE_F16 ) {
throw std : : runtime_error ( format ( " cannot dequantize/convert tensor type %s " , ggml_type_name ( tensor - > type ) ) ) ;
2023-06-10 09:59:17 +02:00
}
if ( nthread < 2 ) {
2023-08-21 22:07:43 +02:00
if ( tensor - > type = = GGML_TYPE_F16 ) {
ggml_fp16_to_fp32_row ( ( ggml_fp16_t * ) tensor - > data , f32_output , nelements ) ;
} else if ( ggml_is_quantized ( tensor - > type ) ) {
qtype . to_float ( tensor - > data , f32_output , nelements ) ;
2023-06-10 09:59:17 +02:00
} else {
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( false ) ; // unreachable
2023-06-10 09:59:17 +02:00
}
return ;
}
2023-08-21 22:07:43 +02:00
auto block_size = tensor - > type = = GGML_TYPE_F16 ? 1 : ( size_t ) ggml_blck_size ( tensor - > type ) ;
auto block_size_bytes = ggml_type_size ( tensor - > type ) ;
2023-06-10 09:59:17 +02:00
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( nelements % block_size = = 0 ) ;
2023-06-10 09:59:17 +02:00
auto nblocks = nelements / block_size ;
auto blocks_per_thread = nblocks / nthread ;
auto spare_blocks = nblocks - ( blocks_per_thread * nthread ) ; // if blocks aren't divisible by thread count
for ( auto tnum = 0 , in_buff_offs = 0 , out_buff_offs = 0 ; tnum < nthread ; tnum + + ) {
auto thr_blocks = blocks_per_thread + ( tnum = = nthread - 1 ? spare_blocks : 0 ) ; // num blocks for this thread
auto thr_elems = thr_blocks * block_size ; // number of elements for this thread
auto thr_block_bytes = thr_blocks * block_size_bytes ; // number of input bytes for this thread
auto compute = [ qtype ] ( ggml_type typ , uint8_t * inbuf , float * outbuf , int nels ) {
if ( typ = = GGML_TYPE_F16 ) {
ggml_fp16_to_fp32_row ( ( ggml_fp16_t * ) inbuf , outbuf , nels ) ;
} else {
2023-07-05 18:13:06 +02:00
qtype . to_float ( inbuf , outbuf , nels ) ;
2023-06-10 09:59:17 +02:00
}
} ;
2023-09-15 03:09:53 +02:00
workers . emplace_back ( compute , tensor - > type , ( uint8_t * ) tensor - > data + in_buff_offs , f32_output + out_buff_offs , thr_elems ) ;
2023-06-10 09:59:17 +02:00
in_buff_offs + = thr_block_bytes ;
out_buff_offs + = thr_elems ;
}
2023-09-15 03:09:53 +02:00
for ( auto & w : workers ) { w . join ( ) ; }
workers . clear ( ) ;
}
# ifdef GGML_USE_K_QUANTS
// Chooses the ggml type a tensor is quantized to when k-quants are enabled.
// Starts from `new_type` and overrides it depending on the tensor name, the requested
// `ftype`, the model architecture/type and the position of the tensor among its peers.
//   new_type          - the type selected so far; returned unchanged if no rule applies
//   tensor            - tensor being quantized (its name and ne[0]/ne[1] are inspected)
//   model             - model the tensor belongs to (arch and type are inspected)
//   ftype             - requested quantization file type
//   i_attention_wv    - running index of "attn_v.weight" tensors; incremented when one is seen
//   n_attention_wv    - total count used together with i_attention_wv in the layer rules
//   i_feed_forward_w2 - running index of "ffn_down.weight" tensors; incremented when one is seen
//   n_feed_forward_w2 - total count used together with i_feed_forward_w2 in the layer rules
// Throws std::runtime_error if a k-quant type was chosen for a tensor whose ne[0] is not
// divisible by QK_K and no fallback exists for that tensor.
static ggml_type get_k_quant_type (
ggml_type new_type , const ggml_tensor * tensor , const llama_model & model , llama_ftype ftype , int * i_attention_wv ,
int n_attention_wv , int * i_feed_forward_w2 , int n_feed_forward_w2
) {
const std : : string name = ggml_get_name ( tensor ) ;
// TODO: avoid hardcoded tensor names - use the TN_* constants
const auto tn = LLM_TN ( model . arch ) ;
// True for layers in the first eighth, in the last eighth, and for every third layer in between.
auto use_more_bits = [ ] ( int i_layer , int num_layers ) - > bool {
return i_layer < num_layers / 8 | | i_layer > = 7 * num_layers / 8 | | ( i_layer - num_layers / 8 ) % 3 = = 2 ;
} ;
// Output weight: Q8_0 for Falcon or when ne[0] is not a multiple of QK_K, otherwise Q6_K
// (unless Q8_0 was already selected).
if ( name = = tn ( LLM_TENSOR_OUTPUT , " weight " ) ) {
int nx = tensor - > ne [ 0 ] ;
if ( model . arch = = LLM_ARCH_FALCON | | nx % QK_K ! = 0 ) {
new_type = GGML_TYPE_Q8_0 ;
}
else if ( new_type ! = GGML_TYPE_Q8_0 ) {
new_type = GGML_TYPE_Q6_K ;
}
// attn_v weights: the order of the else-if chain matters, only the first matching rule applies.
} else if ( name . find ( " attn_v.weight " ) ! = std : : string : : npos ) {
if ( ftype = = LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_M ) {
new_type = * i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K ;
}
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K ;
else if ( ( ftype = = LLAMA_FTYPE_MOSTLY_Q4_K_M | | ftype = = LLAMA_FTYPE_MOSTLY_Q5_K_M ) & &
use_more_bits ( * i_attention_wv , n_attention_wv ) ) new_type = GGML_TYPE_Q6_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q4_K_S & & * i_attention_wv < 4 ) new_type = GGML_TYPE_Q5_K ;
else if ( QK_K = = 64 & & ( ftype = = LLAMA_FTYPE_MOSTLY_Q4_K_S | | ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_S ) & &
( * i_attention_wv < n_attention_wv / 8 | | * i_attention_wv > = 7 * n_attention_wv / 8 ) ) new_type = GGML_TYPE_Q6_K ;
if ( model . type = = MODEL_70B ) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if ( new_type = = GGML_TYPE_Q3_K | | new_type = = GGML_TYPE_Q4_K ) new_type = GGML_TYPE_Q5_K ;
}
+ + * i_attention_wv ;
// ffn_down weights: several rules differ between Falcon and the other architectures.
} else if ( name . find ( " ffn_down.weight " ) ! = std : : string : : npos ) {
if ( ftype = = LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_M ) {
new_type = * i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
: model . arch ! = LLM_ARCH_FALCON | | use_more_bits ( * i_feed_forward_w2 , n_feed_forward_w2 ) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K ;
}
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_L ) {
new_type = model . arch = = LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K ;
}
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q4_K_M ) {
if ( model . arch = = LLM_ARCH_FALCON ) {
new_type = * i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
use_more_bits ( * i_feed_forward_w2 , n_feed_forward_w2 ) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K ;
} else {
if ( use_more_bits ( * i_feed_forward_w2 , n_feed_forward_w2 ) ) new_type = GGML_TYPE_Q6_K ;
}
}
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q5_K_M & & use_more_bits ( * i_feed_forward_w2 , n_feed_forward_w2 ) ) new_type = GGML_TYPE_Q6_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q4_K_S & & model . arch ! = LLM_ARCH_FALCON & & * i_feed_forward_w2 < 4 ) {
new_type = GGML_TYPE_Q5_K ;
}
+ + * i_feed_forward_w2 ;
} else if ( name . find ( " attn_output.weight " ) ! = std : : string : : npos ) {
if ( model . arch ! = LLM_ARCH_FALCON ) {
if ( ftype = = LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K ;
} else {
if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q4_K ;
}
}
else if ( name . find ( " attn_qkv.weight " ) ! = std : : string : : npos ) {
if ( ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_M | | ftype = = LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q4_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q4_K_M ) new_type = GGML_TYPE_Q5_K ;
else if ( ftype = = LLAMA_FTYPE_MOSTLY_Q5_K_M ) new_type = GGML_TYPE_Q6_K ;
}
else if ( name . find ( " ffn_gate.weight " ) ! = std : : string : : npos | | name . find ( " ffn_up.weight " ) ! = std : : string : : npos ) {
if ( ftype = = LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K ;
}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
// Final validation: a k-quant type is only usable if ne[0] is a multiple of QK_K.
bool convert_incompatible_tensor = false ;
if ( new_type = = GGML_TYPE_Q2_K | | new_type = = GGML_TYPE_Q3_K | | new_type = = GGML_TYPE_Q4_K | |
new_type = = GGML_TYPE_Q5_K | | new_type = = GGML_TYPE_Q6_K ) {
int nx = tensor - > ne [ 0 ] ;
int ny = tensor - > ne [ 1 ] ;
if ( nx % QK_K ! = 0 ) {
LLAMA_LOG_WARN ( " \n \n %s : tensor cols %d x %d are not divisible by %d, required for k-quants \n " , __func__ , nx , ny , QK_K ) ;
convert_incompatible_tensor = true ;
}
}
// Fallbacks exist only for the output and token embedding weights; any other tensor is an error.
if ( convert_incompatible_tensor ) {
if ( name = = tn ( LLM_TENSOR_OUTPUT , " weight " ) ) {
new_type = GGML_TYPE_F16 ; //fall back to F16 instead of just failing.
LLAMA_LOG_WARN ( " F16 will be used for this tensor instead. \n " ) ;
} else if ( name = = tn ( LLM_TENSOR_TOKEN_EMBD , " weight " ) ) {
new_type = GGML_TYPE_Q4_0 ; //fall back to Q4_0 instead of just failing.
LLAMA_LOG_WARN ( " Q4_0 will be used for this tensor instead. \n " ) ;
} else {
throw std : : runtime_error ( " Unsupported tensor size encountered \n " ) ;
}
2023-06-10 09:59:17 +02:00
}
2023-09-15 03:09:53 +02:00
return new_type ;
2023-06-10 09:59:17 +02:00
}
2023-09-15 03:09:53 +02:00
# endif
2023-06-10 09:59:17 +02:00
static void llama_model_quantize_internal ( const std : : string & fname_inp , const std : : string & fname_out , const llama_model_quantize_params * params ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
ggml_type quantized_type ;
2023-06-10 09:59:17 +02:00
llama_ftype ftype = params - > ftype ;
switch ( params - > ftype ) {
2023-04-11 17:03:51 +02:00
case LLAMA_FTYPE_MOSTLY_Q4_0 : quantized_type = GGML_TYPE_Q4_0 ; break ;
case LLAMA_FTYPE_MOSTLY_Q4_1 : quantized_type = GGML_TYPE_Q4_1 ; break ;
2023-04-26 22:14:13 +02:00
case LLAMA_FTYPE_MOSTLY_Q5_0 : quantized_type = GGML_TYPE_Q5_0 ; break ;
case LLAMA_FTYPE_MOSTLY_Q5_1 : quantized_type = GGML_TYPE_Q5_1 ; break ;
2023-04-25 22:40:51 +02:00
case LLAMA_FTYPE_MOSTLY_Q8_0 : quantized_type = GGML_TYPE_Q8_0 ; break ;
2023-07-22 20:17:57 +02:00
case LLAMA_FTYPE_MOSTLY_F16 : quantized_type = GGML_TYPE_F16 ; break ;
case LLAMA_FTYPE_ALL_F32 : quantized_type = GGML_TYPE_F32 ; break ;
2023-06-05 22:24:29 +02:00
2023-06-13 12:23:23 +02:00
# ifdef GGML_USE_K_QUANTS
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. I only realized something
was not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen 7950X
box using CUDA with the model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems the memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
// K-quants
2023-06-05 22:24:29 +02:00
case LLAMA_FTYPE_MOSTLY_Q2_K : quantized_type = GGML_TYPE_Q2_K ; break ;
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. I only realized something
was not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen 7950X
box using CUDA with the model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems the memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
case LLAMA_FTYPE_MOSTLY_Q3_K_S :
case LLAMA_FTYPE_MOSTLY_Q3_K_M :
case LLAMA_FTYPE_MOSTLY_Q3_K_L : quantized_type = GGML_TYPE_Q3_K ; break ;
case LLAMA_FTYPE_MOSTLY_Q4_K_S :
case LLAMA_FTYPE_MOSTLY_Q4_K_M : quantized_type = GGML_TYPE_Q4_K ; break ;
case LLAMA_FTYPE_MOSTLY_Q5_K_S :
case LLAMA_FTYPE_MOSTLY_Q5_K_M : quantized_type = GGML_TYPE_Q5_K ; break ;
2023-06-05 22:24:29 +02:00
case LLAMA_FTYPE_MOSTLY_Q6_K : quantized_type = GGML_TYPE_Q6_K ; break ;
2023-06-13 12:23:23 +02:00
# endif
2023-06-05 22:24:29 +02:00
default : throw std : : runtime_error ( format ( " invalid output file type %d \n " , ftype ) ) ;
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. I only realized something
was not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen 7950X
box using CUDA with the model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems the memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
}
2023-03-22 06:32:36 +01:00
2023-08-21 22:07:43 +02:00
int nthread = params - > nthread ;
2023-04-20 19:42:27 +02:00
if ( nthread < = 0 ) {
nthread = std : : thread : : hardware_concurrency ( ) ;
}
2023-09-29 15:48:45 +02:00
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
# if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true ;
# else
constexpr bool use_mmap = false ;
# endif
llama_model_loader ml ( fname_inp , use_mmap ) ;
if ( ml . use_mmap ) {
ml . mapping . reset ( new llama_mmap ( & ml . file , /* prefetch */ 0 , ggml_is_numa ( ) ) ) ;
}
2023-08-21 22:07:43 +02:00
2023-08-26 16:27:49 +02:00
llama_model model ;
2023-09-28 21:42:38 +02:00
llm_load_arch ( ml , model ) ;
llm_load_hparams ( ml , model ) ;
2023-08-26 16:27:49 +02:00
2023-09-01 16:02:48 +02:00
if ( params - > only_copy ) {
ftype = model . ftype ;
}
2023-08-21 22:07:43 +02:00
const size_t align = GGUF_DEFAULT_ALIGNMENT ;
struct gguf_context * ctx_out = gguf_init_empty ( ) ;
// copy the KV pairs from the input file
2023-09-28 21:42:38 +02:00
gguf_set_kv ( ctx_out , ml . ctx_gguf ) ;
2023-08-21 22:07:43 +02:00
gguf_set_val_u32 ( ctx_out , " general.quantization_version " , GGML_QNT_VERSION ) ;
2023-08-22 19:05:59 +02:00
gguf_set_val_u32 ( ctx_out , " general.file_type " , ftype ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-06-13 12:23:23 +02:00
# ifdef GGML_USE_K_QUANTS
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. I only realized something
was not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen 7950X
box using CUDA with the model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems the memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
int n_attention_wv = 0 ;
int n_feed_forward_w2 = 0 ;
2023-08-21 22:07:43 +02:00
2023-09-28 21:42:38 +02:00
for ( int i = 0 ; i < ml . n_tensors ; + + i ) {
struct ggml_tensor * meta = ml . get_tensor_meta ( i ) ;
2023-08-21 22:07:43 +02:00
const std : : string name = ggml_get_name ( meta ) ;
// TODO: avoid hardcoded tensor names - use the TN_* constants
if ( name . find ( " attn_v.weight " ) ! = std : : string : : npos ) {
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. I only realized something
was not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen 7950X
box using CUDA with the model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems the memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
+ + n_attention_wv ;
}
2023-08-21 22:07:43 +02:00
else if ( name . find ( " ffn_down.weight " ) ! = std : : string : : npos ) {
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen7950X
box using CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
+ + n_feed_forward_w2 ;
}
}
2023-08-26 16:27:49 +02:00
if ( n_attention_wv ! = n_feed_forward_w2 | | ( uint32_t ) n_attention_wv ! = model . hparams . n_layer ) {
LLAMA_LOG_WARN ( " %s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d \n " ,
__func__ , n_attention_wv , n_feed_forward_w2 , model . hparams . n_layer ) ;
}
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity) is about the same.
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen7950X
box using CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
int i_attention_wv = 0 ;
int i_feed_forward_w2 = 0 ;
2023-06-13 12:23:23 +02:00
# endif
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity) is about the same.
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen7950X
box using CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
size_t total_size_org = 0 ;
size_t total_size_new = 0 ;
std : : vector < int64_t > hist_all ( 1 < < 4 , 0 ) ;
2023-04-20 19:42:27 +02:00
std : : vector < std : : thread > workers ;
2023-09-15 03:09:53 +02:00
workers . reserve ( nthread ) ;
2023-04-20 19:42:27 +02:00
std : : mutex mutex ;
2023-08-21 22:07:43 +02:00
int idx = 0 ;
2023-09-15 03:09:53 +02:00
std : : vector < no_init < uint8_t > > read_data ;
std : : vector < no_init < uint8_t > > work ;
std : : vector < no_init < float > > f32_conv_buf ;
2023-08-21 22:07:43 +02:00
// populate the original tensors so we get an initial meta data
2023-09-28 21:42:38 +02:00
for ( int i = 0 ; i < ml . n_tensors ; + + i ) {
struct ggml_tensor * meta = ml . get_tensor_meta ( i ) ;
2023-08-21 22:07:43 +02:00
gguf_add_tensor ( ctx_out , meta ) ;
}
std : : ofstream fout ( fname_out , std : : ios : : binary ) ;
const size_t meta_size = gguf_get_meta_size ( ctx_out ) ;
LLAMA_LOG_INFO ( " %s: meta size = %zu bytes \n " , __func__ , meta_size ) ;
// placeholder for the meta data
: : zeros ( fout , meta_size ) ;
2023-09-28 21:42:38 +02:00
for ( int i = 0 ; i < ml . n_tensors ; + + i ) {
struct ggml_tensor * tensor = ml . get_tensor_meta ( i ) ;
2023-08-21 22:07:43 +02:00
const std : : string name = ggml_get_name ( tensor ) ;
2023-09-29 15:48:45 +02:00
if ( ! ml . use_mmap ) {
if ( read_data . size ( ) < ggml_nbytes ( tensor ) ) {
read_data . resize ( ggml_nbytes ( tensor ) ) ;
}
tensor - > data = read_data . data ( ) ;
2023-09-15 03:09:53 +02:00
}
2023-09-28 21:42:38 +02:00
ml . load_data_for ( tensor ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " [%4d/%4d] %36s - [%s], type = %6s, " ,
2023-09-28 21:42:38 +02:00
+ + idx , ml . n_tensors ,
2023-08-21 22:07:43 +02:00
ggml_get_name ( tensor ) ,
llama_format_tensor_shape ( tensor ) . c_str ( ) ,
ggml_type_name ( tensor - > type ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
// This used to be a regex, but <regex> has an extreme cost to compile times.
2023-08-21 22:07:43 +02:00
bool quantize = name . rfind ( " weight " ) = = name . size ( ) - 6 ; // ends with 'weight'?
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
// quantize only 2D tensors
2023-08-21 22:07:43 +02:00
quantize & = ( tensor - > n_dims = = 2 ) ;
quantize & = params - > quantize_output_tensor | | name ! = " output.weight " ;
2023-09-01 16:02:48 +02:00
quantize & = ! params - > only_copy ;
2023-04-20 22:32:59 +02:00
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
enum ggml_type new_type ;
void * new_data ;
size_t new_size ;
2023-09-01 16:02:48 +02:00
if ( quantize ) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
new_type = quantized_type ;
2023-06-13 12:23:23 +02:00
# ifdef GGML_USE_K_QUANTS
2023-09-15 03:09:53 +02:00
new_type = get_k_quant_type (
new_type , tensor , model , ftype , & i_attention_wv , n_attention_wv , & i_feed_forward_w2 , n_feed_forward_w2
) ;
2023-06-13 12:23:23 +02:00
# endif
2023-09-01 16:02:48 +02:00
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize = tensor - > type ! = new_type ;
}
if ( ! quantize ) {
new_type = tensor - > type ;
new_data = tensor - > data ;
new_size = ggml_nbytes ( tensor ) ;
LLAMA_LOG_INFO ( " size = %8.3f MB \n " , ggml_nbytes ( tensor ) / 1024.0 / 1024.0 ) ;
} else {
2023-08-21 22:07:43 +02:00
const size_t nelements = ggml_nelements ( tensor ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
float * f32_data ;
2023-06-10 09:59:17 +02:00
2023-08-21 22:07:43 +02:00
if ( tensor - > type = = GGML_TYPE_F32 ) {
f32_data = ( float * ) tensor - > data ;
} else if ( ggml_is_quantized ( tensor - > type ) & & ! params - > allow_requantize ) {
throw std : : runtime_error ( format ( " requantizing from type %s is disabled " , ggml_type_name ( tensor - > type ) ) ) ;
2023-03-22 06:32:36 +01:00
} else {
2023-09-15 03:09:53 +02:00
llama_convert_tensor_internal ( tensor , f32_conv_buf , workers , nelements , nthread ) ;
2023-08-21 22:07:43 +02:00
f32_data = ( float * ) f32_conv_buf . data ( ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " quantizing to %s .. " , ggml_type_name ( new_type ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
fflush ( stdout ) ;
2023-09-15 03:09:53 +02:00
if ( work . size ( ) < nelements * 4 ) {
work . resize ( nelements * 4 ) ; // upper bound on size
}
2023-08-21 22:07:43 +02:00
new_data = work . data ( ) ;
2023-09-15 03:09:53 +02:00
std : : array < int64_t , 1 < < 4 > hist_cur = { } ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
2023-08-21 22:07:43 +02:00
static const int chunk_size = 32 * 512 ;
2023-04-20 19:42:27 +02:00
const int nchunk = ( nelements + chunk_size - 1 ) / chunk_size ;
const int nthread_use = nthread > 1 ? std : : max ( 1 , std : : min ( nthread , nchunk ) ) : 1 ;
if ( nthread_use < 2 ) {
new_size = ggml_quantize_chunk ( new_type , f32_data , new_data , 0 , nelements , hist_cur . data ( ) ) ;
} else {
size_t counter = 0 ;
new_size = 0 ;
2023-08-21 22:07:43 +02:00
auto compute = [ & mutex , & counter , & hist_cur , & new_size , new_type , f32_data , new_data , nelements ] ( ) {
2023-09-15 03:09:53 +02:00
std : : array < int64_t , 1 < < 4 > local_hist = { } ;
2023-04-20 19:42:27 +02:00
size_t local_size = 0 ;
while ( true ) {
std : : unique_lock < std : : mutex > lock ( mutex ) ;
size_t first = counter ; counter + = chunk_size ;
if ( first > = nelements ) {
2023-09-15 03:09:53 +02:00
if ( local_size > 0 ) {
2023-05-13 10:23:15 +02:00
for ( int j = 0 ; j < int ( local_hist . size ( ) ) ; + + j ) {
hist_cur [ j ] + = local_hist [ j ] ;
}
2023-04-20 19:42:27 +02:00
new_size + = local_size ;
}
break ;
}
lock . unlock ( ) ;
size_t last = std : : min ( nelements , first + chunk_size ) ;
local_size + = ggml_quantize_chunk ( new_type , f32_data , new_data , first , last - first , local_hist . data ( ) ) ;
}
} ;
2023-05-13 10:23:15 +02:00
for ( int it = 0 ; it < nthread_use - 1 ; + + it ) {
2023-09-15 03:09:53 +02:00
workers . emplace_back ( compute ) ;
2023-05-13 10:23:15 +02:00
}
2023-04-20 19:42:27 +02:00
compute ( ) ;
2023-09-15 03:09:53 +02:00
for ( auto & w : workers ) { w . join ( ) ; }
workers . clear ( ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " size = %8.2f MB -> %8.2f MB | hist: " , ggml_nbytes ( tensor ) / 1024.0 / 1024.0 , new_size / 1024.0 / 1024.0 ) ;
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen 7950X
box using CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
int64_t tot_count = 0 ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
for ( size_t i = 0 ; i < hist_cur . size ( ) ; i + + ) {
hist_all [ i ] + = hist_cur [ i ] ;
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen 7950X
box using CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
tot_count + = hist_cur [ i ] ;
2023-03-29 22:51:37 +02:00
}
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen 7950X
box using CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
if ( tot_count > 0 ) {
for ( size_t i = 0 ; i < hist_cur . size ( ) ; i + + ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %5.3f " , hist_cur [ i ] / float ( nelements ) ) ;
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen 7950X
box using CUDA and model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
}
2023-03-22 06:32:36 +01:00
}
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " \n " ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-21 22:07:43 +02:00
total_size_org + = ggml_nbytes ( tensor ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
total_size_new + = new_size ;
2023-08-21 22:07:43 +02:00
// update the gguf meta data as we go
gguf_set_tensor_type ( ctx_out , name . c_str ( ) , new_type ) ;
gguf_set_tensor_data ( ctx_out , name . c_str ( ) , new_data , new_size ) ;
// write tensor data + padding
fout . write ( ( const char * ) new_data , new_size ) ;
zeros ( fout , GGML_PAD ( new_size , align ) - new_size ) ;
}
// go back to beginning of file and write the updated meta data
{
fout . seekp ( 0 ) ;
std : : vector < uint8_t > data ( gguf_get_meta_size ( ctx_out ) ) ;
gguf_get_meta_data ( ctx_out , data . data ( ) ) ;
fout . write ( ( const char * ) data . data ( ) , data . size ( ) ) ;
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
}
2023-03-22 06:32:36 +01:00
2023-08-21 22:07:43 +02:00
fout . close ( ) ;
gguf_free ( ctx_out ) ;
LLAMA_LOG_INFO ( " %s: model size = %8.2f MB \n " , __func__ , total_size_org / 1024.0 / 1024.0 ) ;
LLAMA_LOG_INFO ( " %s: quant size = %8.2f MB \n " , __func__ , total_size_new / 1024.0 / 1024.0 ) ;
// print histogram for all tensors
{
int64_t sum_all = 0 ;
for ( size_t i = 0 ; i < hist_all . size ( ) ; i + + ) {
sum_all + = hist_all [ i ] ;
}
if ( sum_all > 0 ) {
LLAMA_LOG_INFO ( " %s: hist: " , __func__ ) ;
for ( size_t i = 0 ; i < hist_all . size ( ) ; i + + ) {
LLAMA_LOG_INFO ( " %5.3f " , hist_all [ i ] / float ( sum_all ) ) ;
}
LLAMA_LOG_INFO ( " \n " ) ;
}
}
}
2023-09-15 21:38:27 +02:00
static int llama_apply_lora_from_file_internal (
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring fewer layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; on 2 layer model it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph was missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number of accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of other tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirically optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
const struct llama_model & model , const char * path_lora , float scale , const char * path_base_model , int n_threads
2023-09-15 21:38:27 +02:00
) {
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " %s: applying lora adapter from '%s' - please wait ... \n " , __func__ , path_lora ) ;
const int64_t t_start_lora_us = ggml_time_us ( ) ;
auto fin = std : : ifstream ( path_lora , std : : ios : : binary ) ;
if ( ! fin ) {
LLAMA_LOG_ERROR ( " %s: failed to open '%s' \n " , __func__ , path_lora ) ;
return 1 ;
}
// verify magic and version
{
uint32_t magic ;
fin . read ( ( char * ) & magic , sizeof ( magic ) ) ;
uint32_t format_version ;
fin . read ( ( char * ) & format_version , sizeof ( format_version ) ) ;
if ( format_version ! = 1 ) {
LLAMA_LOG_ERROR ( " %s: unsupported file version \n " , __func__ ) ;
return 1 ;
}
}
int32_t lora_r ;
int32_t lora_alpha ;
fin . read ( ( char * ) & lora_r , sizeof ( lora_r ) ) ;
fin . read ( ( char * ) & lora_alpha , sizeof ( lora_alpha ) ) ;
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on a 16 layer model saves 13% memory; on a 2 layer model it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph was missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of others tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirically optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
float scaling = scale * ( float ) lora_alpha / ( float ) lora_r ;
2023-08-21 22:07:43 +02:00
LLAMA_LOG_INFO ( " %s: r = %d, alpha = %d, scaling = %.2f \n " , __func__ , lora_r , lora_alpha , scaling ) ;
// create a temporary ggml context to store the lora tensors
// todo: calculate size from biggest possible tensor
std : : vector < uint8_t > lora_buf ( 1024ull * 1024ull * 1024ull ) ;
struct ggml_init_params params ;
params . mem_size = lora_buf . size ( ) ;
params . mem_buffer = lora_buf . data ( ) ;
params . no_alloc = false ;
ggml_context * lora_ctx = ggml_init ( params ) ;
std : : unordered_map < std : : string , struct ggml_tensor * > lora_tensors ;
// create a name -> tensor map of the model to accelerate lookups
std : : unordered_map < std : : string , struct ggml_tensor * > model_tensors ;
for ( const auto & kv : model . tensors_by_name ) {
model_tensors . insert ( kv ) ;
}
// load base model
2023-08-23 22:08:04 +02:00
std : : unique_ptr < llama_model_loader > ml ;
2023-08-21 22:07:43 +02:00
ggml_context * base_ctx = NULL ;
std : : vector < uint8_t > base_buf ;
if ( path_base_model ) {
LLAMA_LOG_INFO ( " %s: loading base model from '%s' \n " , __func__ , path_base_model ) ;
2023-08-23 22:08:04 +02:00
ml . reset ( new llama_model_loader ( path_base_model , /*use_mmap*/ true ) ) ;
2023-08-21 22:07:43 +02:00
size_t ctx_size ;
size_t mmapped_size ;
2023-08-23 22:08:04 +02:00
ml - > calc_sizes ( ctx_size , mmapped_size ) ;
2023-08-21 22:07:43 +02:00
base_buf . resize ( ctx_size ) ;
ggml_init_params base_params ;
base_params . mem_size = base_buf . size ( ) ;
base_params . mem_buffer = base_buf . data ( ) ;
2023-08-23 22:08:04 +02:00
base_params . no_alloc = ml - > use_mmap ;
2023-08-21 22:07:43 +02:00
base_ctx = ggml_init ( base_params ) ;
// maybe this should in llama_model_loader
2023-08-23 22:08:04 +02:00
if ( ml - > use_mmap ) {
ml - > mapping . reset ( new llama_mmap ( & ml - > file , /* prefetch */ 0 , ggml_is_numa ( ) ) ) ;
2023-08-21 22:07:43 +02:00
}
}
// read tensors and apply
bool warned = false ;
int n_tensors = 0 ;
std : : vector < uint8_t > work_buffer ;
while ( true ) {
int32_t n_dims ;
int32_t length ;
int32_t ftype ;
fin . read ( reinterpret_cast < char * > ( & n_dims ) , sizeof ( n_dims ) ) ;
fin . read ( reinterpret_cast < char * > ( & length ) , sizeof ( length ) ) ;
fin . read ( reinterpret_cast < char * > ( & ftype ) , sizeof ( ftype ) ) ;
if ( fin . eof ( ) ) {
break ;
}
int32_t ne [ 2 ] = { 1 , 1 } ;
for ( int i = 0 ; i < n_dims ; + + i ) {
fin . read ( reinterpret_cast < char * > ( & ne [ i ] ) , sizeof ( ne [ i ] ) ) ;
}
std : : string name ;
{
char buf [ 1024 ] ;
fin . read ( buf , length ) ;
name = std : : string ( buf , length ) ;
}
// check for lora suffix and get the type of tensor
const std : : string lora_suffix = " .lora " ;
size_t pos = name . rfind ( lora_suffix ) ;
if ( pos = = std : : string : : npos ) {
LLAMA_LOG_ERROR ( " %s: error: '%s' is not a lora tensor \n " , __func__ , name . c_str ( ) ) ;
return 1 ;
}
std : : string lora_type = name . substr ( pos + lora_suffix . length ( ) ) ;
std : : string base_name = name ;
base_name . erase ( pos ) ;
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
if ( model_tensors . find ( base_name ) = = model_tensors . end ( ) ) {
LLAMA_LOG_ERROR ( " %s: unknown tensor '%s' in lora adapter \n " , __func__ , name . data ( ) ) ;
return 1 ;
}
// create ggml tensor
ggml_type wtype ;
switch ( ftype ) {
case 0 : wtype = GGML_TYPE_F32 ; break ;
case 1 : wtype = GGML_TYPE_F16 ; break ;
default :
{
LLAMA_LOG_ERROR ( " %s: invalid tensor data type '%d' \n " ,
__func__ , ftype ) ;
return false ;
}
}
ggml_tensor * lora_tensor ;
if ( n_dims = = 2 ) {
lora_tensor = ggml_new_tensor_2d ( lora_ctx , wtype , ne [ 0 ] , ne [ 1 ] ) ;
}
else {
LLAMA_LOG_ERROR ( " %s: unsupported tensor dimension %d \n " , __func__ , n_dims ) ;
return 1 ;
}
ggml_set_name ( lora_tensor , " lora_tensor " ) ;
// load tensor data
size_t offset = fin . tellg ( ) ;
size_t tensor_data_size = ggml_nbytes ( lora_tensor ) ;
offset = ( offset + 31 ) & - 32 ;
fin . seekg ( offset ) ;
fin . read ( ( char * ) lora_tensor - > data , tensor_data_size ) ;
lora_tensors [ name ] = lora_tensor ;
// check if we have both A and B tensors and apply
if ( lora_tensors . find ( base_name + " .loraA " ) ! = lora_tensors . end ( ) & &
lora_tensors . find ( base_name + " .loraB " ) ! = lora_tensors . end ( ) ) {
ggml_tensor * dest_t = model_tensors [ base_name ] ;
offload_func_t offload_func = llama_nop ;
offload_func_t offload_func_force_inplace = llama_nop ;
# ifdef GGML_USE_CUBLAS
if ( dest_t - > backend = = GGML_BACKEND_GPU | | dest_t - > backend = = GGML_BACKEND_GPU_SPLIT ) {
if ( dest_t - > type ! = GGML_TYPE_F16 ) {
throw std : : runtime_error ( format (
" %s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models " , __func__ ) ) ;
}
offload_func = ggml_cuda_assign_buffers ;
offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace ;
}
# endif // GGML_USE_CUBLAS
ggml_tensor * base_t ;
2023-08-23 22:08:04 +02:00
if ( ml ) {
struct gguf_context * ctx_gguf = ml - > ctx_gguf ;
2023-08-21 22:07:43 +02:00
// load from base model
if ( gguf_find_tensor ( ctx_gguf , base_name . c_str ( ) ) < 0 ) {
2023-08-23 22:08:04 +02:00
// TODO: throw
2023-08-21 22:07:43 +02:00
LLAMA_LOG_ERROR ( " %s: error: tensor '%s' not found in base model \n " , __func__ , base_name . c_str ( ) ) ;
return 1 ;
}
// TODO: not tested!! maybe not working!
2023-08-23 22:08:04 +02:00
base_t = ml - > create_tensor ( base_ctx , base_name , { ( uint32_t ) dest_t - > ne [ 0 ] , ( uint32_t ) dest_t - > ne [ 1 ] } , GGML_BACKEND_CPU ) ;
ml - > load_data_for ( base_t ) ;
2023-08-21 22:07:43 +02:00
} else {
base_t = dest_t ;
}
if ( ggml_is_quantized ( base_t - > type ) ) {
if ( ! warned ) {
LLAMA_LOG_WARN ( " %s: warning: using a lora adapter with a quantized model may result in poor quality, "
" use a f16 or f32 base model with --lora-base \n " , __func__ ) ;
warned = true ;
}
}
ggml_tensor * loraA = lora_tensors [ base_name + " .loraA " ] ;
GGML_ASSERT ( loraA - > type = = GGML_TYPE_F32 ) ;
ggml_set_name ( loraA , " loraA " ) ;
ggml_tensor * loraB = lora_tensors [ base_name + " .loraB " ] ;
GGML_ASSERT ( loraB - > type = = GGML_TYPE_F32 ) ;
ggml_set_name ( loraB , " loraB " ) ;
if ( base_t - > ne [ 0 ] ! = loraA - > ne [ 1 ] | | base_t - > ne [ 1 ] ! = loraB - > ne [ 1 ] ) {
LLAMA_LOG_ERROR ( " %s: incompatible tensor dimensions (% " PRId64 " and % " PRId64 " ); "
" are you sure that this adapter is for this model? \n " , __func__ , base_t - > ne [ 0 ] , loraA - > ne [ 1 ] ) ;
return 1 ;
}
// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat ( lora_ctx , loraA , loraB ) ;
offload_func ( BA ) ;
ggml_set_name ( BA , " BA " ) ;
if ( scaling ! = 1.0f ) {
ggml_tensor * scale_tensor = ggml_new_f32 ( lora_ctx , scaling ) ;
ggml_set_name ( scale_tensor , " scale_tensor " ) ;
BA = ggml_scale_inplace ( lora_ctx , BA , scale_tensor ) ;
offload_func ( BA ) ;
ggml_set_name ( BA , " BA_scaled " ) ;
}
ggml_tensor * r ;
if ( base_t = = dest_t ) {
r = ggml_add_inplace ( lora_ctx , dest_t , BA ) ;
offload_func_force_inplace ( r ) ;
ggml_set_name ( r , " r_add_inplace " ) ;
}
else {
r = ggml_add ( lora_ctx , base_t , BA ) ;
offload_func ( r ) ;
ggml_set_name ( r , " r_add " ) ;
r = ggml_cpy ( lora_ctx , r , dest_t ) ;
offload_func ( r ) ;
ggml_set_name ( r , " r_cpy " ) ;
}
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
don't use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; on 2 layer model it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph was missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of other tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8 MB down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirically optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
struct ggml_cgraph * gf = ggml_new_graph ( lora_ctx ) ;
ggml_build_forward_expand ( gf , r ) ;
2023-08-21 22:07:43 +02:00
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
don't use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; on 2 layer model it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph was missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in the backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of other tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirically optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
ggml_graph_compute_helper ( work_buffer , gf , n_threads ) ;
2023-08-21 22:07:43 +02:00
// we won't need these tensors again, reset the context to save memory
ggml_free ( lora_ctx ) ;
lora_ctx = ggml_init ( params ) ;
lora_tensors . clear ( ) ;
n_tensors + + ;
if ( n_tensors % 4 = = 0 ) {
LLAMA_LOG_INFO ( " . " ) ;
}
}
}
// TODO: this should be in a destructor, it will leak on failure
ggml_free ( lora_ctx ) ;
if ( base_ctx ) {
ggml_free ( base_ctx ) ;
}
const int64_t t_lora_us = ggml_time_us ( ) - t_start_lora_us ;
LLAMA_LOG_INFO ( " done (%.2f ms) \n " , t_lora_us / 1000.0 ) ;
return 0 ;
}
//
// interface implementation
//
2023-09-28 21:42:38 +02:00
struct llama_model_params llama_model_default_params ( ) {
struct llama_model_params result = {
2023-09-04 21:26:24 +02:00
/*.n_gpu_layers =*/ 0 ,
2023-08-21 22:07:43 +02:00
/*.main_gpu =*/ 0 ,
/*.tensor_split =*/ nullptr ,
/*.progress_callback =*/ nullptr ,
/*.progress_callback_user_data =*/ nullptr ,
/*.vocab_only =*/ false ,
/*.use_mmap =*/ true ,
/*.use_mlock =*/ false ,
} ;
2023-09-04 21:26:24 +02:00
# ifdef GGML_USE_METAL
result . n_gpu_layers = 1 ;
# endif
2023-08-21 22:07:43 +02:00
return result ;
}
2023-09-28 21:42:38 +02:00
struct llama_context_params llama_context_default_params ( ) {
struct llama_context_params result = {
/*.seed =*/ LLAMA_DEFAULT_SEED ,
/*.n_ctx =*/ 512 ,
/*.n_batch =*/ 512 ,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS , // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS ,
/*.rope_freq_base =*/ 0.0f ,
/*.rope_freq_scale =*/ 0.0f ,
/*.mul_mat_q =*/ true ,
/*.f16_kv =*/ true ,
/*.logits_all =*/ false ,
/*.embedding =*/ false ,
} ;
return result ;
}
2023-08-21 22:07:43 +02:00
struct llama_model_quantize_params llama_model_quantize_default_params ( ) {
struct llama_model_quantize_params result = {
/*.nthread =*/ 0 ,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1 ,
/*.allow_requantize =*/ false ,
/*.quantize_output_tensor =*/ true ,
2023-09-01 16:02:48 +02:00
/*.only_copy =*/ false ,
2023-08-21 22:07:43 +02:00
} ;
return result ;
}
// Reports the value of LLAMA_MAX_DEVICES.
int llama_max_devices(void) {
    const int max_devices = LLAMA_MAX_DEVICES;
    return max_devices;
}
bool llama_mmap_supported ( void ) {
return llama_mmap : : SUPPORTED ;
}
bool llama_mlock_supported ( void ) {
return llama_mlock : : SUPPORTED ;
}
// One-time backend initialization.
//   numa - when true, ggml_numa_init() is called.
// Steps, in order: initialize the ggml timer, create and immediately free a
// throwaway ggml context, optionally initialize NUMA, and (GGML_USE_MPI builds
// only) initialize the MPI backend.
void llama_backend_init ( bool numa ) {
ggml_time_init ( ) ;
2023-03-22 06:32:36 +01:00
2023-08-21 22:07:43 +02:00
// needed to initialize f16 tables
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
{
2023-08-21 22:07:43 +02:00
// zero-sized context with no caller-provided buffer; it exists only for the
// side effect of ggml_init() and is released right away
struct ggml_init_params params = { 0 , NULL , false } ;
struct ggml_context * ctx = ggml_init ( params ) ;
ggml_free ( ctx ) ;
}
2023-03-22 06:32:36 +01:00
2023-08-21 22:07:43 +02:00
// NUMA setup is opt-in via the `numa` argument
if ( numa ) {
ggml_numa_init ( ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-21 22:07:43 +02:00
// MPI backend is only initialized in builds that define GGML_USE_MPI
# ifdef GGML_USE_MPI
ggml_mpi_backend_init ( ) ;
# endif
}
2023-06-28 17:53:37 +02:00
2023-08-21 22:07:43 +02:00
// Backend teardown. Only GGML_USE_MPI builds have anything to release here;
// in all other builds this function does nothing.
void llama_backend_free(void) {
#if defined(GGML_USE_MPI)
    ggml_mpi_backend_free();
#endif
}
2023-06-28 17:53:37 +02:00
2023-08-21 22:07:43 +02:00
// Returns the current ggml timer reading as reported by ggml_time_us().
int64_t llama_time_us(void) {
    const int64_t now_us = ggml_time_us();
    return now_us;
}
2023-03-22 06:32:36 +01:00
2023-06-24 10:47:58 +02:00
struct llama_model * llama_load_model_from_file (
2023-03-22 06:32:36 +01:00
const char * path_model ,
2023-09-28 21:42:38 +02:00
struct llama_model_params params ) {
2023-03-22 06:32:36 +01:00
ggml_time_init ( ) ;
2023-06-24 10:47:58 +02:00
llama_model * model = new llama_model ;
2023-08-23 22:08:04 +02:00
unsigned cur_percentage = 0 ;
if ( params . progress_callback = = NULL ) {
params . progress_callback_user_data = & cur_percentage ;
params . progress_callback = [ ] ( float progress , void * ctx ) {
unsigned * cur_percentage_p = ( unsigned * ) ctx ;
unsigned percentage = ( unsigned ) ( 100 * progress ) ;
while ( percentage > * cur_percentage_p ) {
* cur_percentage_p = percentage ;
LLAMA_LOG_INFO ( " . " ) ;
if ( percentage > = 100 ) {
LLAMA_LOG_INFO ( " \n " ) ;
}
}
} ;
}
2023-09-28 21:42:38 +02:00
if ( ! llama_model_load ( path_model , * model , params . n_gpu_layers ,
params . main_gpu , params . tensor_split ,
params . use_mmap , params . use_mlock , params . vocab_only ,
2023-08-21 22:07:43 +02:00
params . progress_callback , params . progress_callback_user_data ) ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " %s: failed to load model \n " , __func__ ) ;
2023-06-24 10:47:58 +02:00
delete model ;
return nullptr ;
}
return model ;
}
// Destroys a model object with `delete`.
void llama_free_model(struct llama_model * model) {
    delete model;
}
// Creates a new llama_context bound to `model`, configured from `params`.
//
// Returns nullptr when `model` is null or when the KV cache cannot be
// initialized, and NULL when (Metal builds) Metal initialization or buffer
// registration fails. On those failure paths the partially built context is
// released with llama_free() before returning.
//
// `params` is taken by value; the seed field is modified locally when it equals
// LLAMA_DEFAULT_SEED.
struct llama_context * llama_new_context_with_model (
2023-07-07 18:24:01 +02:00
struct llama_model * model ,
struct llama_context_params params ) {
2023-06-24 10:47:58 +02:00
if ( ! model ) {
return nullptr ;
}
2023-07-14 20:55:24 +02:00
llama_context * ctx = new llama_context ( * model ) ;
2023-03-22 06:32:36 +01:00
2023-09-28 21:42:38 +02:00
const auto & hparams = model - > hparams ;
auto & cparams = ctx - > cparams ;
// copy user parameters into the context; a zero n_ctx / rope_freq_base /
// rope_freq_scale means "use the value the model was trained with"
cparams . n_batch = params . n_batch ;
cparams . n_ctx = params . n_ctx = = 0 ? hparams . n_ctx_train : params . n_ctx ;
cparams . rope_freq_base = params . rope_freq_base = = 0 ? hparams . rope_freq_base_train : params . rope_freq_base ;
cparams . rope_freq_scale = params . rope_freq_scale = = 0 ? hparams . rope_freq_scale_train : params . rope_freq_scale ;
cparams . n_threads = params . n_threads ;
cparams . n_threads_batch = params . n_threads_batch ;
cparams . mul_mat_q = params . mul_mat_q ;
2023-06-29 15:15:15 +02:00
// the default seed sentinel is replaced by the current time
if ( params . seed = = LLAMA_DEFAULT_SEED ) {
2023-03-22 06:45:00 +01:00
params . seed = time ( NULL ) ;
}
2023-09-28 21:42:38 +02:00
LLAMA_LOG_INFO ( " %s: n_ctx = %u \n " , __func__ , cparams . n_ctx ) ;
LLAMA_LOG_INFO ( " %s: freq_base = %.1f \n " , __func__ , cparams . rope_freq_base ) ;
LLAMA_LOG_INFO ( " %s: freq_scale = %g \n " , __func__ , cparams . rope_freq_scale ) ;
2023-03-22 06:32:36 +01:00
ctx - > rng = std : : mt19937 ( params . seed ) ;
ctx - > logits_all = params . logits_all ;
2023-03-24 22:17:37 +01:00
// element type passed to llama_kv_cache_init: F16 when f16_kv is set, F32 otherwise
ggml_type memory_type = params . f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32 ;
2023-03-22 06:32:36 +01:00
2023-03-24 16:05:13 +01:00
// reserve memory for context buffers
2023-09-28 21:42:38 +02:00
// everything below is skipped for vocab-only models
if ( ! hparams . vocab_only ) {
if ( ! llama_kv_cache_init ( ctx - > model . hparams , ctx - > kv_self , memory_type , cparams . n_ctx , model - > n_gpu_layers ) ) {
2023-08-21 22:07:43 +02:00
LLAMA_LOG_ERROR ( " %s: llama_kv_cache_init() failed for self-attention cache \n " , __func__ ) ;
2023-03-24 22:17:37 +01:00
llama_free ( ctx ) ;
return nullptr ;
}
{
2023-06-24 10:47:58 +02:00
// reported KV size is the byte size of the k and v tensors combined
const size_t memory_size = ggml_nbytes ( ctx - > kv_self . k ) + ggml_nbytes ( ctx - > kv_self . v ) ;
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: kv self size = %7.2f MB \n " , __func__ , memory_size / 1024.0 / 1024.0 ) ;
2023-03-24 22:17:37 +01:00
}
2023-03-25 19:51:14 +01:00
// resized during inference
2023-03-24 16:05:13 +01:00
// capacity is reserved for n_ctx * n_vocab logits when logits_all is set, else for n_vocab
if ( params . logits_all ) {
2023-09-28 21:42:38 +02:00
ctx - > logits . reserve ( cparams . n_ctx * hparams . n_vocab ) ;
2023-03-24 16:05:13 +01:00
} else {
2023-04-22 08:21:32 +02:00
ctx - > logits . reserve ( hparams . n_vocab ) ;
2023-03-24 16:05:13 +01:00
}
// the embedding buffer is only sized when embeddings were requested
if ( params . embedding ) {
2023-03-25 19:51:14 +01:00
ctx - > embedding . resize ( hparams . n_embd ) ;
2023-03-24 16:05:13 +01:00
}
2023-03-24 22:17:37 +01:00
2023-07-30 15:58:01 +02:00
{
static const size_t tensor_alignment = 32 ;
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
ctx - > buf_compute . resize ( ggml_tensor_overhead ( ) * GGML_MAX_NODES + ggml_graph_overhead ( ) ) ;
// create measure allocator
ctx - > alloc = ggml_allocr_new_measure ( tensor_alignment ) ;
// build worst-case graph
2023-09-28 21:42:38 +02:00
// n_tokens is the smaller of n_ctx and n_batch; n_past is the rest of the context
int n_tokens = ( int ) std : : min ( cparams . n_ctx , cparams . n_batch ) ;
int n_past = cparams . n_ctx - n_tokens ;
2023-08-21 22:07:43 +02:00
llama_token token = llama_token_bos ( ctx ) ; // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
2023-09-28 21:42:38 +02:00
ggml_cgraph * gf = llama_build_graph ( * ctx , llama_batch_get_one ( & token , n_tokens , n_past , 0 ) ) ;
2023-09-28 18:04:36 +02:00
2023-08-16 22:08:28 +02:00
# ifdef GGML_USE_METAL
2023-09-28 21:42:38 +02:00
// the Metal context is only created when n_gpu_layers > 0
if ( model - > n_gpu_layers > 0 ) {
2023-10-02 12:49:59 +02:00
ggml_metal_log_set_callback ( llama_log_callback_default , NULL ) ;
2023-08-16 22:08:28 +02:00
ctx - > ctx_metal = ggml_metal_init ( 1 ) ;
if ( ! ctx - > ctx_metal ) {
LLAMA_LOG_ERROR ( " %s: ggml_metal_init() failed \n " , __func__ ) ;
llama_free ( ctx ) ;
return NULL ;
}
2023-09-28 18:04:36 +02:00
//ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
2023-08-16 22:08:28 +02:00
}
# endif
2023-07-30 15:58:01 +02:00
// measure memory requirements for the graph
// tensor_alignment extra bytes are added on top of the measured size
size_t alloc_size = ggml_allocr_alloc_graph ( ctx - > alloc , gf ) + tensor_alignment ;
2023-09-28 21:42:38 +02:00
LLAMA_LOG_INFO ( " %s: compute buffer total size = %.2f MB \n " , __func__ , ( ctx - > buf_compute . size + alloc_size ) / 1024.0 / 1024.0 ) ;
2023-07-30 15:58:01 +02:00
// recreate allocator with exact memory requirements
ggml_allocr_free ( ctx - > alloc ) ;
ctx - > buf_alloc . resize ( alloc_size ) ;
2023-08-21 22:07:43 +02:00
ctx - > alloc = ggml_allocr_new ( ctx - > buf_alloc . data , ctx - > buf_alloc . size , tensor_alignment ) ;
2023-08-16 22:08:28 +02:00
# ifdef GGML_USE_METAL
// NOTE(review): this branch is currently empty; its only statement is commented out
if ( ctx - > ctx_metal ) {
2023-09-28 18:04:36 +02:00
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
2023-08-16 22:08:28 +02:00
}
# endif
2023-08-22 15:25:19 +02:00
# ifdef GGML_USE_CUBLAS
2023-09-28 21:42:38 +02:00
// the CUDA scratch buffer is sized to the measured allocator size
ggml_cuda_set_scratch_size ( alloc_size ) ;
LLAMA_LOG_INFO ( " %s: VRAM scratch buffer: %.2f MB \n " , __func__ , alloc_size / 1024.0 / 1024.0 ) ;
// calculate total VRAM usage
// add_tensor accumulates a tensor's byte size only if its backend is GPU or GPU_SPLIT
auto add_tensor = [ ] ( const ggml_tensor * t , size_t & size ) {
if ( t - > backend = = GGML_BACKEND_GPU | | t - > backend = = GGML_BACKEND_GPU_SPLIT ) {
size + = ggml_nbytes ( t ) ;
}
} ;
size_t model_vram_size = 0 ;
for ( const auto & kv : model - > tensors_by_name ) {
add_tensor ( kv . second , model_vram_size ) ;
2023-08-22 15:25:19 +02:00
}
2023-09-28 21:42:38 +02:00
size_t kv_vram_size = 0 ;
add_tensor ( ctx - > kv_self . k , kv_vram_size ) ;
add_tensor ( ctx - > kv_self . v , kv_vram_size ) ;
// context VRAM = scratch buffer + KV tensors counted above
size_t ctx_vram_size = alloc_size + kv_vram_size ;
size_t total_vram_size = model_vram_size + ctx_vram_size ;
LLAMA_LOG_INFO ( " %s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB) \n " , __func__ ,
total_vram_size / 1024.0 / 1024.0 ,
model_vram_size / 1024.0 / 1024.0 ,
ctx_vram_size / 1024.0 / 1024.0 ) ;
2023-07-30 15:58:01 +02:00
# endif
2023-08-22 15:25:19 +02:00
}
2023-03-24 16:05:13 +01:00
2023-06-04 22:34:30 +02:00
# ifdef GGML_USE_METAL
2023-09-28 21:42:38 +02:00
if ( model - > n_gpu_layers > 0 ) {
2023-09-04 21:26:24 +02:00
// this allocates all Metal resources and memory buffers
2023-08-14 15:37:39 +02:00
2023-09-04 21:26:24 +02:00
void * data_ptr = NULL ;
size_t data_size = 0 ;
2023-06-18 08:09:47 +02:00
2023-09-28 21:42:38 +02:00
// model data comes from the file mapping when one exists, otherwise from the ggml context buffer
if ( ctx - > model . mapping ) {
2023-09-04 21:26:24 +02:00
data_ptr = ctx - > model . mapping - > addr ;
data_size = ctx - > model . mapping - > size ;
} else {
data_ptr = ggml_get_mem_buffer ( ctx - > model . ctx ) ;
data_size = ggml_get_mem_size ( ctx - > model . ctx ) ;
}
2023-06-04 22:34:30 +02:00
2023-09-04 21:26:24 +02:00
const size_t max_size = ggml_get_max_tensor_size ( ctx - > model . ctx ) ;
2023-06-18 08:09:47 +02:00
2023-09-04 21:26:24 +02:00
LLAMA_LOG_INFO ( " %s: max tensor size = %8.2f MB \n " , __func__ , max_size / 1024.0 / 1024.0 ) ;
2023-06-28 18:35:54 +02:00
2023-08-21 22:07:43 +02:00
// local helper macro: on a falsy result, log, free the context and return NULL from this function
# define LLAMA_METAL_CHECK_BUF(result) \
2023-09-04 21:26:24 +02:00
if ( ! ( result ) ) { \
LLAMA_LOG_ERROR ( " %s: failed to add buffer \n " , __func__ ) ; \
llama_free ( ctx ) ; \
return NULL ; \
}
2023-06-28 18:35:54 +02:00
2023-09-28 21:42:38 +02:00
// register the model data, KV cache and allocator buffers with the Metal context
LLAMA_METAL_CHECK_BUF ( ggml_metal_add_buffer ( ctx - > ctx_metal , " data " , data_ptr , data_size , max_size ) ) ;
LLAMA_METAL_CHECK_BUF ( ggml_metal_add_buffer ( ctx - > ctx_metal , " kv " , ctx - > kv_self . buf . data , ctx - > kv_self . buf . size , 0 ) ) ;
2023-09-04 21:26:24 +02:00
LLAMA_METAL_CHECK_BUF ( ggml_metal_add_buffer ( ctx - > ctx_metal , " alloc " , ctx - > buf_alloc . data , ctx - > buf_alloc . size , 0 ) ) ;
2023-08-21 22:07:43 +02:00
# undef LLAMA_METAL_CHECK_BUF
2023-09-04 21:26:24 +02:00
}
2023-08-21 22:07:43 +02:00
# endif
2023-09-04 21:26:24 +02:00
}
2023-06-28 18:35:54 +02:00
2023-08-21 22:07:43 +02:00
# ifdef GGML_USE_MPI
ctx - > ctx_mpi = ggml_mpi_init ( ) ;
2023-04-17 17:28:55 +02:00
2023-08-21 22:07:43 +02:00
// NOTE(review): ranks > 0 currently hit the GGML_ASSERT below; the worker eval loop is commented out
if ( ggml_mpi_rank ( ctx - > ctx_mpi ) > 0 ) {
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
2023-09-28 18:04:36 +02:00
// TODO: needs fix after #3228
GGML_ASSERT ( false & & " not implemented " ) ;
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
2023-08-21 22:07:43 +02:00
llama_backend_free ( ) ;
exit ( 1 ) ;
}
# endif
2023-04-17 17:28:55 +02:00
2023-08-21 22:07:43 +02:00
return ctx ;
}
2023-04-17 17:28:55 +02:00
2023-08-21 22:07:43 +02:00
// Destroys a context object with `delete`.
void llama_free(struct llama_context * ctx) {
    delete ctx;
}
2023-06-28 18:35:54 +02:00
2023-09-28 21:42:38 +02:00
// Returns a pointer to the model that `ctx` refers to.
const llama_model * llama_get_model(const struct llama_context * ctx) {
    const llama_model & model = ctx->model;
    return &model;
}
2023-04-17 17:28:55 +02:00
2023-08-21 22:07:43 +02:00
int llama_n_ctx ( const struct llama_context * ctx ) {
2023-09-28 21:42:38 +02:00
return ctx - > cparams . n_ctx ;
2023-08-21 22:07:43 +02:00
}
2023-07-07 18:24:01 +02:00
2023-09-28 21:42:38 +02:00
enum llama_vocab_type llama_vocab_type ( const struct llama_model * model ) {
return model - > vocab . type ;
2023-08-21 22:07:43 +02:00
}
2023-04-17 17:28:55 +02:00
2023-09-28 21:42:38 +02:00
int llama_n_vocab ( const struct llama_model * model ) {
2023-08-21 22:07:43 +02:00
return model - > vocab . id_to_token . size ( ) ;
}
2023-04-17 17:28:55 +02:00
2023-09-28 21:42:38 +02:00
int llama_n_ctx_train ( const struct llama_model * model ) {
2023-09-08 17:43:35 +02:00
return model - > hparams . n_ctx_train ;
}
2023-09-28 21:42:38 +02:00
int llama_n_embd ( const struct llama_model * model ) {
2023-08-21 22:07:43 +02:00
return model - > hparams . n_embd ;
}
2023-04-17 17:28:55 +02:00
2023-10-03 19:09:28 +02:00
float llama_rope_freq_scale_train ( const struct llama_model * model ) {
return model - > hparams . rope_freq_scale_train ;
}
2023-08-25 15:16:19 +02:00
int llama_model_desc ( const struct llama_model * model , char * buf , size_t buf_size ) {
2023-08-23 22:08:04 +02:00
return snprintf ( buf , buf_size , " %s %s %s " ,
2023-09-28 21:42:38 +02:00
llama_model_arch_name ( model - > arch ) . c_str ( ) ,
2023-08-23 22:08:04 +02:00
llama_model_type_name ( model - > type ) ,
llama_model_ftype_name ( model - > ftype ) . c_str ( ) ) ;
2023-08-21 22:07:43 +02:00
}
2023-04-17 17:28:55 +02:00
2023-08-25 15:16:19 +02:00
uint64_t llama_model_size ( const struct llama_model * model ) {
uint64_t size = 0 ;
for ( const auto & it : model - > tensors_by_name ) {
size + = ggml_nbytes ( it . second ) ;
}
return size ;
}
uint64_t llama_model_n_params ( const struct llama_model * model ) {
uint64_t nparams = 0 ;
for ( const auto & it : model - > tensors_by_name ) {
nparams + = ggml_nelements ( it . second ) ;
}
return nparams ;
}
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; 2 layer memory it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in the backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of others tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirical optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
The second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
struct ggml_tensor * llama_get_model_tensor ( struct llama_model * model , const char * name ) {
return ggml_get_tensor ( model - > ctx , name ) ;
}
2023-08-21 22:07:43 +02:00
int llama_model_quantize (
const char * fname_inp ,
const char * fname_out ,
const llama_model_quantize_params * params ) {
try {
llama_model_quantize_internal ( fname_inp , fname_out , params ) ;
return 0 ;
} catch ( const std : : exception & err ) {
LLAMA_LOG_ERROR ( " %s: failed to quantize: %s \n " , __func__ , err . what ( ) ) ;
return 1 ;
}
2023-04-17 17:28:55 +02:00
}
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; 2 layer memory it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph was missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of others tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirical optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
int llama_apply_lora_from_file ( struct llama_context * ctx , const char * path_lora , float scale , const char * path_base_model , int n_threads ) {
2023-04-17 17:28:55 +02:00
try {
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
don't use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring fewer layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be a bit faster for a low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on a 16 layer model saves 13% memory; on a 2 layer model it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph was missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of others tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirically optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
return llama_apply_lora_from_file_internal ( ctx - > model , path_lora , scale , path_base_model , n_threads ) ;
2023-06-24 10:47:58 +02:00
} catch ( const std : : exception & err ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " %s: failed to apply lora adapter: %s \n " , __func__ , err . what ( ) ) ;
2023-06-24 10:47:58 +02:00
return 1 ;
}
}
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
don't use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; on 2 layer model it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph was missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number of accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of other tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8 MB down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirically optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
int llama_model_apply_lora_from_file ( const struct llama_model * model , const char * path_lora , float scale , const char * path_base_model , int n_threads ) {
2023-06-24 10:47:58 +02:00
try {
train : finetune LORA (#2632)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums were not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; 2 layer memory it saves 2% memory.
the computation results are the same
* add API functions to access llama model tensors
* add stub example for finetuning, based on train-text-from-scratch
* move and remove code
* add API functions to access remaining model parameters:
mult, head and rot
* first draft for LORA finetune training
* remove const model and layer arguments in API functions for accessing model tensors
* bug fixes to make finetune compile
automatic allocator does not work yet
* add debug prints for training memory improvements
* fix names of lora tensors
* avoid stack overflow resulting from big ggml_cgraph
replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand
* replace llama API functions to get model tensors by one function to get model tensor by name
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
* remove unused call to not existing llama_get_layer_from_model
* implement ggml_compute_forward_out_prod_q_f32
* remove trailing whitespace
* add lora finetune support on quantized base model tensors
* add ggml_add_cast API function
this function works like ggml_add, but accepts a data type for the resulting tensor.
only supported for quantized src0 input.
* use ggml_add_cast in finetuning
lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models
* bug fix: actually use result type passed to ggml_add_cast
* make sure base model tensors data cannot be used in viewable operations
memory allocator would try to make lora application inplace on base model tensors.
since those are memory mapped this will result in memory access violations
* fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors
* avoid keeping in memory ALL of the gradients
The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients.
During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero-values. This requires the graph reset.
To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There instead of using add (or sub or similar) I test whether the existing gradient to be changed is a zero-valued-tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added, otherwise the regular add (not inplace, allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and more importantly they won't need a unique memory slot, just to make them zero.
* remove trailing whitespace
* remove debug prints and function to compute tensor data hash
* improve optimization iteration prints
* adjust maximal values to support finetuning 3B models
* change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4
* bug fix: make sure finetune input gradient is allocated at begin and kept until end
* remove unnecessary src tensor from ggml_get_rows_back
we don't need data of src[2] for computation, only to setup the correct output shape.
remove dependency on src[2], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included.
this is similar to how ggml_reshape does it.
* remove unnecessary src tensor from ggml_repeat & ggml_repeat_back
we don't need data of src[1] for computation, only to setup the correct output shape.
remove dependency on src[1], so that allocator can work more freely.
the computational graph is still completely determined, because the output shape is naturally included
* resolve todo
allocator will only make it inplace when they are of the same type
* mixing multiple LORA adapters is now possible
pass more than one '--lora FNAME' argument to apply more than one LORA.
use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter.
* add option to save finetune output every N iterations
* also save latest finetune output with ITERATION="LATEST" and print where files are saved
saving with LATEST makes it easier to resume training from the latest checkpoint
the string "LATEST" can be configured with command line option "--fn-latest STR"
* update checkpoint train stats before saving via "--save-every"
* add command line option `--rank-wo N` for rank of wo tensor
* update finetune README
* fix dump_non_result_info_yaml to output multiple lora adapters
* bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)
* replace llama_n_mult by llama_n_ff
* finetune bug fixes to compile with merged in code from master
* remove prediction related code to reduce duplicated code with main
use main instead
* reduce large memory overhead in train-text-from-scratch
all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
* add comment explaining why finetune checkpoints are allocated in one block
* make default value of float member a float literal
* handle rms_norm and rope parameters the same as in train-text-from-scratch
* remove unused code
* remove vocab related code as it is unnecessary
* add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints
so that they can be differentiated from lora finetune checkpoints
* add gguf constants and load/save functions from train-text-from-scratch
* add load & save lora finetune checkpoints via gguf
* add python script to convert old finetune checkpoint files to gguf
* remove old checkpoint save & load code
* remove code to print data checksums which was used to verify correctness of new gguf code
* omit tokenization when training is disabled, only save llama lora adapter
training can be disabled by passing '-n 0' to finetune
* remove trailing whitespace
* update README.md
* implement ggml_compute_forward_repeat_f16
* avoid stack overflow of large cgraphs in test-grad0
* add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32
ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors.
in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent.
this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore
* increase test-grad0 context mem size to accommodate for bigger cgraph
* add sanity check to ggml_compute_backward, asserting the correct shape of gradients
* fix ggml_acc_or_set to return tensor of correct shape
* remove unused 'inplace' argument from ggml_compute_backward function
inplace operations to add gradients are no longer created by ggml_compute_backward
use allocator to automatically make inplace operations
* add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations
* fix error message in ggml_allocr_alloc to display actual max_avail
* fix check_gradient
ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph missing
* use tensor->view_src instead of ggml_is_view and get_view_source
* move gradient checkpointing code into ggml, new API function:
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
* replace custom data getters and setters by ggml functions
* train-text-from-scratch can train (full finetune) gguf models
just pass the gguf model via `--checkpoint-in FN`.
after this, to continue training, pass the generated checkpoint instead of the original gguf model.
tested with smaller models, bigger models may exceed available memory.
use (LORA) finetune for those.
* remove trailing whitespace
* add option to save train-text-from-scratch output every N iterations
* update README.md
* fix warnings
* fix warnings
* remove finetune option to disable allocator
the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation
* add tensor checkpoints only when gradient checkpointing is enabled
* initialize opt ggml context if none was provided
* add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
* finetune: automatically allocate all memory and changes to command line options
remove '--n_examples N' parameter, as it no longer makes sense to call optimization process multiple times in a loop.
add '--only_write_lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter.
remove memory buffer related command line options.
improve iteration console output.
* add finetune to Makefile
* update README.md
* print time per iteration and estimate remaining time
* increase measured alloc size by tensor_alignment
ggml_allocr_reset will reduce the given size by up to tensor_alignment-1
* fix README.md
* add some more allocator debug prints
* bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue
* revert last commit
"bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue"
"alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size."
This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue.
* remove unnecessary "0x" before "%p" output
* move measurement memory segment to upper region of the address space
* update README.md
* fix printf format warnings
* add missing gguf_free in load_checkpoint_lora_file
* load default rms_norm and rope parameters from base model
* add gradient accumulation
specify number accumulation steps with '--grad-acc N'.
this will simulate a bigger batch size of grad_acc*batch.
* fix tracking of train_samples and train_tokens
* build : fix compile warnings
* ggml : fix L-BFGS linesearch loop
* improve finetune time measurement
fix printf warnings on system where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to actual time per iteration by removing very small first duration before first iteration was performed.
fix bug in output of total training time, the reported value was 1000 times too small.
* specify default lora rank with '--lora-r N'
'--lora-r N' will specify default rank for all tensors
'--rank-wq N', etc. will override this default rank for specific tensor types.
* fix gradient accumulation bug where the same batch was used for each microstep
* fix gradient accumulation bug where the same batch was used for each microstep
* support grouped-query-attention in ggml_flash_attn and ggml_flash_attn_back
k and v can now be repeated in q along ne[2]
in forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.
in the backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.
since ne2 is not the same for q,k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.
we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion of second argument to be contiguous.
since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.
change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using f16 exp table lookup in forward softmax: when temporarily changing softmax to use actual exp function, the reported gradient differences go away. gradient differences coming solely from f16 table lookup are acceptable.
added a note to explain this.
* add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'.
* fix finetune to support grouped-query-attention (using flash-attention)
note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention.
* support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b)
* test broadcasting mul_mat backward pass
* decouple random number generator of each operation test
when changing one test the rng of others tests is not influenced anymore
* add comment briefly describing what ggml_repeat_back does
* simplify broadcasting mul_mat backward using ggml_repeat_back
* add cgraph evaluation order member and corresponding enum type
this controls in which order ggml_build_forward visits source nodes.
by default the nodes are visited left to right, i.e. src[0] first.
in some cases it is beneficial for ggml-alloc to visit in a different order.
two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last).
* measure max compute size for each cgraph eval order and use best order
this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB
* remove unused command line options
* add sample start patterns and options to force new or by default resume last shuffling
* update shuffle rng state on reshuffle
* exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* remove probably unnecessary exception type flags from stringstream
* pass correct max number of tokens to llama_tokenize
* account for possible leading whitespace that will be added by tokenizer
e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
* use unrolled vec_mad in out_prod
y is vec_mad result vec.
x is vec_mad input vec.
v is vec_mad input scalar.
ggml_vec_mad_f32_unroll will internally loop over x and v with same y.
GGML_VEC_MAD_UNROLL is by default defined to 32.
This value is empirical optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:
unroll_xv unroll_yv
1 67014.643 87826.469
2 77117.552 89077.656
4 72091.311 109121.657
8 61077.543 88678.334
16 56914.67 79514.947
24 59024.595 84350.254
28 55952.446 83368.73
32 51476.658 85177.745
36 55973.792 84659.92
40 55139.616 93844.738
48 60736.392 93330.267
64 99856.878 116994.99
Second column is when unrolling yv instead of xv
* set lora_alpha to value of lora_r if it is not set via command line
otherwise only changing lora_r will change scaling of lora adapter used in prediction
* reshuffle original sample order instead of the previous shuffled order
otherwise resumed reshuffle will not result in same sample order
* block tiling for out-prod inspired by mul-mat
block sizes are empirically optimized
roughly doubles the flops of out-prod
* exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32
* add static keywords
* remove outcommented old code
* update train-text-from-scratch with tokenization, sample selection and shuffling from finetune
* remove lbfgs related train parameters
* move common train functions into common/train.[h|cpp]
* move train state into struct train_state
* move train data saving code into callback to unify code of opt_callback
train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp
* move common train params into common/train
* move common opt_callback into common/train
* fix consume_common_train_arg
* save and load head_count_kv in lora checkpoints
* increase train_samples by used_samples instead of number of batches
one batch can contain more than one sample when option "fill_with_next_samples" is used
* fix usage of llama_tokenize
* remove static from process_escape since we need it exposed in header
* fix code formatting of long function declarations
* fix condition in load_train_state_gguf
* use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg")
* fix saving and loading of training type
* remove terminating '\0' from tokenization
(llama_tokenize is now passed the string length instead of relying on terminating '\0')
* fix compile warnings
* fix compile warnings
* use new/delete for train_state instead of malloc/free
using malloc may result in seg faults when trying to assign string fields
* assert that sample_count > 0, avoiding division by zero
* fix frand to return value in interval [0,1)
* add train option "--sample-random-offsets"
Use samples beginning at random offsets.
The offset is only applied to the first sample in each batch context window.
Together with "--fill-with-next-samples" this may help for training endless text generation.
For example given a dataset containing samples "abcd", "ABCD", "0123".
With context size of 8 and options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos",
the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc.
With "--sample-random-offsets" it can also be filled with "23abcdAB", "bcd0123A", etc.
* deduplicate code into function
* remove n_rot hparam, as it must always be hparam.n_embd_head()
* align code
* assert correct base model tensor shapes
* move some params from lora hparams into model hparams and load model params from gguf
this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters
* remove now unnecessary llama API functions to get model params that were added by this PR
* train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N'
* train-text-from-scratch: automatically allocate opt context
* train-text-from-scratch: automatically allocate input tensors
* train-text-from-scratch: automatically allocate compute memory
* remove unused options and equalize train-text-from-scratch with finetune
* initialize opt->loss_after with zero
* add export-lora program
* remove trailing whitespace
* add export-lora build in Makefile
* remove unused struct tensor_info from export-lora
* add export-lora build dependency to llama
because it depends on common, which depends on llama
* update finetune README.md
* cancel optimization when specified number of epochs is completed
* improve handling of export-lora arguments
print errors and warnings when files could not be read or created
* Fix export-lora.cpp "not enough space in the context's memory pool" (#1)
* Fix export-lora.cpp "not enough space in the context's memory pool"
Without this patch, export-lora would sometimes error with "not enough space in the context's memory pool (needed 656784, available 656800)".
* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16
---------
Co-authored-by: xaedes <xaedes@gmail.com>
* improve handling of not yet supported tensor types
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
2023-09-28 20:40:11 +02:00
return llama_apply_lora_from_file_internal ( * model , path_lora , scale , path_base_model , n_threads ) ;
2023-06-05 22:24:29 +02:00
} catch ( const std : : exception & err ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " %s: failed to apply lora adapter: %s \n " , __func__ , err . what ( ) ) ;
2023-04-17 17:28:55 +02:00
return 1 ;
}
}
2023-05-01 09:24:20 +02:00
int llama_get_kv_cache_token_count ( const struct llama_context * ctx ) {
2023-09-28 18:04:36 +02:00
return ctx - > kv_self . head ;
2023-04-02 12:23:04 +02:00
}
2023-09-28 18:04:36 +02:00
void llama_kv_cache_tokens_rm ( struct llama_context * ctx , int32_t c0 , int32_t c1 ) {
llama_kv_cache_tokens_rm ( ctx - > kv_self , c0 , c1 ) ;
}
2023-04-24 06:40:02 +02:00
2023-09-28 18:04:36 +02:00
void llama_kv_cache_seq_rm ( struct llama_context * ctx , llama_seq_id seq_id , llama_pos p0 , llama_pos p1 ) {
llama_kv_cache_seq_rm ( ctx - > kv_self , seq_id , p0 , p1 ) ;
}
void llama_kv_cache_seq_cp ( struct llama_context * ctx , llama_seq_id seq_id_src , llama_seq_id seq_id_dst , llama_pos p0 , llama_pos p1 ) {
llama_kv_cache_seq_cp ( ctx - > kv_self , seq_id_src , seq_id_dst , p0 , p1 ) ;
}
void llama_kv_cache_seq_keep ( struct llama_context * ctx , llama_seq_id seq_id ) {
llama_kv_cache_seq_keep ( ctx - > kv_self , seq_id ) ;
}
void llama_kv_cache_seq_shift ( struct llama_context * ctx , llama_seq_id seq_id , llama_pos p0 , llama_pos p1 , llama_pos delta ) {
llama_kv_cache_seq_shift ( ctx - > kv_self , seq_id , p0 , p1 , delta ) ;
2023-04-26 22:08:43 +02:00
}
2023-05-03 04:26:13 +02:00
// Returns the *maximum* size of the state
2023-05-01 09:24:20 +02:00
size_t llama_get_state_size ( const struct llama_context * ctx ) {
2023-04-24 06:40:02 +02:00
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
// for reference, std::mt19937(1337) serializes to 6701 bytes.
const size_t s_rng_size = sizeof ( size_t ) ;
const size_t s_rng = LLAMA_MAX_RNG_STATE ;
const size_t s_logits_capacity = sizeof ( size_t ) ;
const size_t s_logits_size = sizeof ( size_t ) ;
const size_t s_logits = ctx - > logits . capacity ( ) * sizeof ( float ) ;
const size_t s_embedding_size = sizeof ( size_t ) ;
const size_t s_embedding = ctx - > embedding . size ( ) * sizeof ( float ) ;
const size_t s_kv_size = sizeof ( size_t ) ;
const size_t s_kv_ntok = sizeof ( int ) ;
2023-06-24 10:47:58 +02:00
const size_t s_kv = ctx - > kv_self . buf . size ;
2023-04-24 06:40:02 +02:00
const size_t s_total = (
+ s_rng_size
+ s_rng
+ s_logits_capacity
+ s_logits_size
+ s_logits
+ s_embedding_size
+ s_embedding
+ s_kv_size
+ s_kv_ntok
+ s_kv
) ;
return s_total ;
2023-04-02 12:23:04 +02:00
}
2023-08-21 22:07:43 +02:00
// llama_context_data
//
// Abstract byte sink: implementations receive raw chunks through write()
// and report how many bytes they have accepted so far.
struct llama_data_context {
    virtual ~llama_data_context() = default;

    // consume `size` bytes starting at `src`
    virtual void write(const void * src, size_t size) = 0;

    // running total of bytes passed to write()
    virtual size_t get_size_written() = 0;
};
// llama_data_context that copies every chunk into a caller-provided memory
// region. The region's size is not known to this type, so no bounds checking
// is performed here.
struct llama_data_buffer_context : llama_data_context {
    uint8_t * ptr;              // next write position
    size_t    size_written = 0; // bytes copied so far

    llama_data_buffer_context(uint8_t * p) : ptr(p) {}

    void write(const void * src, size_t size) override {
        memcpy(ptr, src, size);

        // advance the cursor and the running total by the same amount
        ptr          = ptr + size;
        size_written = size_written + size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};
// llama_data_context that forwards every chunk to llama_file::write_raw()
// and keeps a running count of the bytes handed over.
struct llama_data_file_context : llama_data_context {
    llama_file * file;             // destination file (not owned: never closed or freed here)
    size_t       size_written = 0; // bytes forwarded so far

    llama_data_file_context(llama_file * f) : file(f) {}

    void write(const void * src, size_t size) override {
        file->write_raw(src, size);
        size_written = size_written + size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};
2023-08-04 13:29:52 +02:00
/** copy state data into either a buffer or file depending on the passed in context
*
* file context :
* llama_file file ( " /path " , " wb " ) ;
* llama_data_file_context data_ctx ( & file ) ;
* llama_copy_state_data ( ctx , & data_ctx ) ;
*
* buffer context :
* std : : vector < uint8_t > buf ( max_size , 0 ) ;
* llama_data_buffer_context data_ctx ( & buf . data ( ) ) ;
* llama_copy_state_data ( ctx , & data_ctx ) ;
*
*/
2023-09-15 21:38:27 +02:00
static void llama_copy_state_data_internal ( struct llama_context * ctx , llama_data_context * data_ctx ) {
2023-04-24 06:40:02 +02:00
// copy rng
{
std : : stringstream rng_ss ;
rng_ss < < ctx - > rng ;
const size_t rng_size = rng_ss . str ( ) . size ( ) ;
char rng_buf [ LLAMA_MAX_RNG_STATE ] ;
memset ( & rng_buf [ 0 ] , 0 , LLAMA_MAX_RNG_STATE ) ;
memcpy ( & rng_buf [ 0 ] , rng_ss . str ( ) . data ( ) , rng_ss . str ( ) . size ( ) ) ;
2023-08-04 13:29:52 +02:00
data_ctx - > write ( & rng_size , sizeof ( rng_size ) ) ;
data_ctx - > write ( & rng_buf [ 0 ] , LLAMA_MAX_RNG_STATE ) ;
2023-04-24 06:40:02 +02:00
}
// copy logits
{
const size_t logits_cap = ctx - > logits . capacity ( ) ;
const size_t logits_size = ctx - > logits . size ( ) ;
2023-08-04 13:29:52 +02:00
data_ctx - > write ( & logits_cap , sizeof ( logits_cap ) ) ;
data_ctx - > write ( & logits_size , sizeof ( logits_size ) ) ;
2023-04-24 06:40:02 +02:00
if ( logits_size ) {
2023-08-04 13:29:52 +02:00
data_ctx - > write ( ctx - > logits . data ( ) , logits_size * sizeof ( float ) ) ;
2023-04-24 06:40:02 +02:00
}
2023-08-04 13:29:52 +02:00
// If there is a gap between the size and the capacity, write padding
size_t padding_size = ( logits_cap - logits_size ) * sizeof ( float ) ;
if ( padding_size > 0 ) {
std : : vector < uint8_t > padding ( padding_size , 0 ) ; // Create a buffer filled with zeros
data_ctx - > write ( padding . data ( ) , padding_size ) ;
}
2023-04-24 06:40:02 +02:00
}
// copy embeddings
{
const size_t embedding_size = ctx - > embedding . size ( ) ;
2023-08-04 13:29:52 +02:00
data_ctx - > write ( & embedding_size , sizeof ( embedding_size ) ) ;
2023-04-24 06:40:02 +02:00
if ( embedding_size ) {
2023-08-04 13:29:52 +02:00
data_ctx - > write ( ctx - > embedding . data ( ) , embedding_size * sizeof ( float ) ) ;
2023-04-24 06:40:02 +02:00
}
}
// copy kv cache
{
2023-06-24 10:47:58 +02:00
const auto & kv_self = ctx - > kv_self ;
2023-05-03 04:26:13 +02:00
const auto & hparams = ctx - > model . hparams ;
2023-09-28 21:42:38 +02:00
const auto & cparams = ctx - > cparams ;
2023-10-03 20:04:01 +02:00
const auto n_layer = hparams . n_layer ;
const auto n_embd = hparams . n_embd_gqa ( ) ;
const auto n_ctx = cparams . n_ctx ;
2023-05-03 04:26:13 +02:00
2023-10-03 20:04:01 +02:00
const size_t kv_buf_size = kv_self . buf . size ;
const uint32_t kv_head = kv_self . head ;
const uint32_t kv_size = kv_self . size ;
2023-04-24 06:40:02 +02:00
2023-10-03 20:04:01 +02:00
data_ctx - > write ( & kv_buf_size , sizeof ( kv_buf_size ) ) ;
data_ctx - > write ( & kv_head , sizeof ( kv_head ) ) ;
data_ctx - > write ( & kv_size , sizeof ( kv_size ) ) ;
2023-04-24 06:40:02 +02:00
2023-10-03 20:04:01 +02:00
if ( kv_buf_size ) {
2023-05-03 04:26:13 +02:00
const size_t elt_size = ggml_element_size ( kv_self . k ) ;
2023-05-13 08:08:52 +02:00
2023-06-19 17:20:06 +02:00
ggml_context * cpy_ctx = ggml_init ( { 4096 , NULL , /* no_alloc */ true } ) ;
2023-05-03 04:26:13 +02:00
ggml_cgraph gf { } ;
2023-10-03 20:04:01 +02:00
ggml_tensor * kout3d = ggml_new_tensor_3d ( cpy_ctx , kv_self . k - > type , n_embd , kv_head , n_layer ) ;
2023-08-04 13:29:52 +02:00
std : : vector < uint8_t > kout3d_data ( ggml_nbytes ( kout3d ) , 0 ) ;
kout3d - > data = kout3d_data . data ( ) ;
2023-05-03 04:26:13 +02:00
2023-10-03 20:04:01 +02:00
ggml_tensor * vout3d = ggml_new_tensor_3d ( cpy_ctx , kv_self . v - > type , kv_head , n_embd , n_layer ) ;
2023-08-04 13:29:52 +02:00
std : : vector < uint8_t > vout3d_data ( ggml_nbytes ( vout3d ) , 0 ) ;
vout3d - > data = vout3d_data . data ( ) ;
2023-05-03 04:26:13 +02:00
ggml_tensor * k3d = ggml_view_3d ( cpy_ctx , kv_self . k ,
2023-10-03 20:04:01 +02:00
n_embd , kv_head , n_layer ,
2023-05-03 04:26:13 +02:00
elt_size * n_embd , elt_size * n_embd * n_ctx , 0 ) ;
ggml_tensor * v3d = ggml_view_3d ( cpy_ctx , kv_self . v ,
2023-10-03 20:04:01 +02:00
kv_head , n_embd , n_layer ,
2023-05-03 04:26:13 +02:00
elt_size * n_ctx , elt_size * n_ctx * n_embd , 0 ) ;
ggml_build_forward_expand ( & gf , ggml_cpy ( cpy_ctx , k3d , kout3d ) ) ;
ggml_build_forward_expand ( & gf , ggml_cpy ( cpy_ctx , v3d , vout3d ) ) ;
2023-07-07 18:24:01 +02:00
ggml_graph_compute_helper ( ctx - > work_buffer , & gf , /*n_threads*/ 1 ) ;
2023-05-13 08:08:52 +02:00
ggml_free ( cpy_ctx ) ;
2023-08-04 13:29:52 +02:00
// our data is now in the kout3d_data and vout3d_data buffers
// write them to file
data_ctx - > write ( kout3d_data . data ( ) , kout3d_data . size ( ) ) ;
data_ctx - > write ( vout3d_data . data ( ) , vout3d_data . size ( ) ) ;
2023-04-24 06:40:02 +02:00
}
2023-10-03 20:04:01 +02:00
for ( uint32_t i = 0 ; i < kv_size ; + + i ) {
const auto & cell = kv_self . cells [ i ] ;
const llama_pos pos = cell . pos ;
const size_t seq_id_size = cell . seq_id . size ( ) ;
data_ctx - > write ( & pos , sizeof ( pos ) ) ;
data_ctx - > write ( & seq_id_size , sizeof ( seq_id_size ) ) ;
for ( auto seq_id : cell . seq_id ) {
data_ctx - > write ( & seq_id , sizeof ( seq_id ) ) ;
}
}
2023-04-24 06:40:02 +02:00
}
2023-08-04 13:29:52 +02:00
}
2023-04-24 06:40:02 +02:00
2023-08-04 13:29:52 +02:00
size_t llama_copy_state_data ( struct llama_context * ctx , uint8_t * dst ) {
llama_data_buffer_context data_ctx ( dst ) ;
llama_copy_state_data_internal ( ctx , & data_ctx ) ;
2023-04-24 06:40:02 +02:00
2023-08-04 13:29:52 +02:00
return data_ctx . get_size_written ( ) ;
2023-04-02 12:23:04 +02:00
}
2023-04-24 06:40:02 +02:00
// Sets the state reading from the specified source address
2023-05-20 09:14:31 +02:00
size_t llama_set_state_data ( struct llama_context * ctx , uint8_t * src ) {
uint8_t * inp = src ;
2023-04-24 06:40:02 +02:00
// set rng
{
size_t rng_size ;
char rng_buf [ LLAMA_MAX_RNG_STATE ] ;
2023-05-13 08:08:52 +02:00
memcpy ( & rng_size , inp , sizeof ( rng_size ) ) ; inp + = sizeof ( rng_size ) ;
memcpy ( & rng_buf [ 0 ] , inp , LLAMA_MAX_RNG_STATE ) ; inp + = LLAMA_MAX_RNG_STATE ;
2023-04-24 06:40:02 +02:00
std : : stringstream rng_ss ;
rng_ss . str ( std : : string ( & rng_buf [ 0 ] , rng_size ) ) ;
rng_ss > > ctx - > rng ;
2023-09-07 19:22:29 +02:00
GGML_ASSERT ( ! rng_ss . fail ( ) ) ;
2023-04-24 06:40:02 +02:00
}
// set logits
{
size_t logits_cap ;
size_t logits_size ;
2023-05-13 08:08:52 +02:00
memcpy ( & logits_cap , inp , sizeof ( logits_cap ) ) ; inp + = sizeof ( logits_cap ) ;
memcpy ( & logits_size , inp , sizeof ( logits_size ) ) ; inp + = sizeof ( logits_size ) ;
2023-04-24 06:40:02 +02:00
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ctx - > logits . capacity ( ) = = logits_cap ) ;
2023-04-24 06:40:02 +02:00
if ( logits_size ) {
ctx - > logits . resize ( logits_size ) ;
2023-05-13 08:08:52 +02:00
memcpy ( ctx - > logits . data ( ) , inp , logits_size * sizeof ( float ) ) ;
2023-04-24 06:40:02 +02:00
}
2023-05-13 08:08:52 +02:00
inp + = logits_cap * sizeof ( float ) ;
2023-04-24 06:40:02 +02:00
}
// set embeddings
{
size_t embedding_size ;
2023-05-13 08:08:52 +02:00
memcpy ( & embedding_size , inp , sizeof ( embedding_size ) ) ; inp + = sizeof ( embedding_size ) ;
2023-04-24 06:40:02 +02:00
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( ctx - > embedding . capacity ( ) = = embedding_size ) ;
2023-04-24 06:40:02 +02:00
if ( embedding_size ) {
2023-05-13 08:08:52 +02:00
memcpy ( ctx - > embedding . data ( ) , inp , embedding_size * sizeof ( float ) ) ;
inp + = embedding_size * sizeof ( float ) ;
2023-04-24 06:40:02 +02:00
}
}
// set kv cache
{
2023-06-24 10:47:58 +02:00
const auto & kv_self = ctx - > kv_self ;
2023-05-03 04:26:13 +02:00
const auto & hparams = ctx - > model . hparams ;
2023-09-28 21:42:38 +02:00
const auto & cparams = ctx - > cparams ;
2023-05-03 04:26:13 +02:00
const int n_layer = hparams . n_layer ;
2023-07-28 10:42:53 +02:00
const int n_embd = hparams . n_embd_gqa ( ) ;
2023-09-28 21:42:38 +02:00
const int n_ctx = cparams . n_ctx ;
2023-05-03 04:26:13 +02:00
2023-10-03 20:04:01 +02:00
size_t kv_buf_size ;
uint32_t kv_head ;
uint32_t kv_size ;
2023-04-24 06:40:02 +02:00
2023-10-03 20:04:01 +02:00
memcpy ( & kv_buf_size , inp , sizeof ( kv_buf_size ) ) ; inp + = sizeof ( kv_buf_size ) ;
memcpy ( & kv_head , inp , sizeof ( kv_head ) ) ; inp + = sizeof ( kv_head ) ;
memcpy ( & kv_size , inp , sizeof ( kv_size ) ) ; inp + = sizeof ( kv_size ) ;
2023-04-24 06:40:02 +02:00
2023-10-03 20:04:01 +02:00
if ( kv_buf_size ) {
GGML_ASSERT ( kv_self . buf . size = = kv_buf_size ) ;
2023-05-03 04:26:13 +02:00
const size_t elt_size = ggml_element_size ( kv_self . k ) ;
2023-05-13 08:08:52 +02:00
2023-06-19 17:20:06 +02:00
ggml_context * cpy_ctx = ggml_init ( { 4096 , NULL , /* no_alloc */ true } ) ;
2023-05-03 04:26:13 +02:00
ggml_cgraph gf { } ;
2023-10-03 20:04:01 +02:00
ggml_tensor * kin3d = ggml_new_tensor_3d ( cpy_ctx , kv_self . k - > type , n_embd , kv_head , n_layer ) ;
2023-05-13 08:08:52 +02:00
kin3d - > data = ( void * ) inp ;
inp + = ggml_nbytes ( kin3d ) ;
2023-04-24 06:40:02 +02:00
2023-10-03 20:04:01 +02:00
ggml_tensor * vin3d = ggml_new_tensor_3d ( cpy_ctx , kv_self . v - > type , kv_head , n_embd , n_layer ) ;
2023-05-13 08:08:52 +02:00
vin3d - > data = ( void * ) inp ;
inp + = ggml_nbytes ( vin3d ) ;
2023-04-24 06:40:02 +02:00
2023-05-03 04:26:13 +02:00
ggml_tensor * k3d = ggml_view_3d ( cpy_ctx , kv_self . k ,
2023-10-03 20:04:01 +02:00
n_embd , kv_head , n_layer ,
2023-05-03 04:26:13 +02:00
elt_size * n_embd , elt_size * n_embd * n_ctx , 0 ) ;
2023-04-24 06:40:02 +02:00
2023-05-03 04:26:13 +02:00
ggml_tensor * v3d = ggml_view_3d ( cpy_ctx , kv_self . v ,
2023-10-03 20:04:01 +02:00
kv_head , n_embd , n_layer ,
2023-05-03 04:26:13 +02:00
elt_size * n_ctx , elt_size * n_ctx * n_embd , 0 ) ;
2023-04-24 06:40:02 +02:00
2023-05-03 04:26:13 +02:00
ggml_build_forward_expand ( & gf , ggml_cpy ( cpy_ctx , kin3d , k3d ) ) ;
ggml_build_forward_expand ( & gf , ggml_cpy ( cpy_ctx , vin3d , v3d ) ) ;
2023-07-07 18:24:01 +02:00
ggml_graph_compute_helper ( ctx - > work_buffer , & gf , /*n_threads*/ 1 ) ;
2023-05-13 08:08:52 +02:00
ggml_free ( cpy_ctx ) ;
2023-04-24 06:40:02 +02:00
}
2023-10-03 20:04:01 +02:00
ctx - > kv_self . head = kv_head ;
2023-09-28 18:04:36 +02:00
ctx - > kv_self . size = kv_size ;
2023-10-03 20:04:01 +02:00
ctx - > kv_self . cells . resize ( kv_size ) ;
for ( uint32_t i = 0 ; i < kv_size ; + + i ) {
llama_pos pos ;
size_t seq_id_size ;
memcpy ( & pos , inp , sizeof ( pos ) ) ; inp + = sizeof ( pos ) ;
memcpy ( & seq_id_size , inp , sizeof ( seq_id_size ) ) ; inp + = sizeof ( seq_id_size ) ;
ctx - > kv_self . cells [ i ] . pos = pos ;
llama_seq_id seq_id ;
for ( size_t j = 0 ; j < seq_id_size ; + + j ) {
memcpy ( & seq_id , inp , sizeof ( seq_id ) ) ; inp + = sizeof ( seq_id ) ;
ctx - > kv_self . cells [ i ] . seq_id . insert ( seq_id ) ;
}
}
2023-04-24 06:40:02 +02:00
}
2023-05-13 08:08:52 +02:00
const size_t nread = inp - src ;
2023-05-03 04:26:13 +02:00
const size_t max_size = llama_get_state_size ( ctx ) ;
2023-04-24 06:40:02 +02:00
2023-08-21 22:07:43 +02:00
GGML_ASSERT ( nread < = max_size ) ;
2023-04-24 06:40:02 +02:00
return nread ;
2023-04-02 12:23:04 +02:00
}
2023-07-01 18:02:58 +02:00
static bool llama_load_session_file_internal ( struct llama_context * ctx , const char * path_session , llama_token * tokens_out , size_t n_token_capacity , size_t * n_token_count_out ) {
2023-05-01 13:54:59 +02:00
llama_file file ( path_session , " rb " ) ;
// sanity checks
{
const uint32_t magic = file . read_u32 ( ) ;
const uint32_t version = file . read_u32 ( ) ;
2023-05-13 10:23:15 +02:00
if ( magic ! = LLAMA_SESSION_MAGIC | | version ! = LLAMA_SESSION_VERSION ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " %s : unknown (magic, version) for session file: %08x, %08x \n " , __func__ , magic , version ) ;
2023-05-01 13:54:59 +02:00
return false ;
}
llama_hparams session_hparams ;
file . read_raw ( & session_hparams , sizeof ( llama_hparams ) ) ;
if ( session_hparams ! = ctx - > model . hparams ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s : model hparams didn't match from session file! \n " , __func__ ) ;
2023-05-01 13:54:59 +02:00
return false ;
}
}
// load the prompt
{
const uint32_t n_token_count = file . read_u32 ( ) ;
if ( n_token_count > n_token_capacity ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " %s : token count in session file exceeded capacity! %u > %zu \n " , __func__ , n_token_count , n_token_capacity ) ;
2023-05-01 13:54:59 +02:00
return false ;
}
file . read_raw ( tokens_out , sizeof ( llama_token ) * n_token_count ) ;
* n_token_count_out = n_token_count ;
}
// restore the context state
{
const size_t n_state_size_cur = file . size - file . tell ( ) ;
2023-05-03 04:26:13 +02:00
const size_t n_state_size_max = llama_get_state_size ( ctx ) ;
2023-05-01 13:54:59 +02:00
2023-05-03 04:26:13 +02:00
if ( n_state_size_cur > n_state_size_max ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " %s : the state size in session file is too big! max %zu, got %zu \n " , __func__ , n_state_size_max , n_state_size_cur ) ;
2023-05-01 13:54:59 +02:00
return false ;
}
2023-05-03 04:26:13 +02:00
std : : vector < uint8_t > state_data ( n_state_size_max ) ;
2023-05-01 13:54:59 +02:00
file . read_raw ( state_data . data ( ) , n_state_size_cur ) ;
llama_set_state_data ( ctx , state_data . data ( ) ) ;
}
2023-07-01 18:05:09 +02:00
return true ;
2023-07-01 18:02:58 +02:00
}
2023-05-01 13:54:59 +02:00
2023-07-01 18:02:58 +02:00
bool llama_load_session_file ( struct llama_context * ctx , const char * path_session , llama_token * tokens_out , size_t n_token_capacity , size_t * n_token_count_out ) {
try {
return llama_load_session_file_internal ( ctx , path_session , tokens_out , n_token_capacity , n_token_count_out ) ;
} catch ( const std : : exception & err ) {
2023-08-09 22:46:40 +02:00
LLAMA_LOG_ERROR ( " error loading session file: %s \n " , err . what ( ) ) ;
2023-07-01 18:02:58 +02:00
return false ;
}
2023-05-01 13:54:59 +02:00
}
bool llama_save_session_file ( struct llama_context * ctx , const char * path_session , const llama_token * tokens , size_t n_token_count ) {
llama_file file ( path_session , " wb " ) ;
file . write_u32 ( LLAMA_SESSION_MAGIC ) ;
file . write_u32 ( LLAMA_SESSION_VERSION ) ;
file . write_raw ( & ctx - > model . hparams , sizeof ( llama_hparams ) ) ;
// save the prompt
file . write_u32 ( ( uint32_t ) n_token_count ) ;
file . write_raw ( tokens , sizeof ( llama_token ) * n_token_count ) ;
2023-08-04 13:29:52 +02:00
// save the context state using stream saving
llama_data_file_context data_ctx ( & file ) ;
llama_copy_state_data_internal ( ctx , & data_ctx ) ;
2023-05-01 13:54:59 +02:00
return true ;
}
2023-03-22 06:32:36 +01:00
int llama_eval (
struct llama_context * ctx ,
2023-09-28 18:04:36 +02:00
llama_token * tokens ,
int32_t n_tokens ,
2023-09-28 21:42:38 +02:00
int n_past ) {
2023-09-28 18:04:36 +02:00
llama_kv_cache_tokens_rm ( ctx - > kv_self , n_past , - 1 ) ;
2023-06-28 17:53:37 +02:00
2023-09-28 21:42:38 +02:00
const int ret = llama_decode_internal ( * ctx , llama_batch_get_one ( tokens , n_tokens , n_past , 0 ) ) ;
2023-09-28 18:04:36 +02:00
if ( ret < 0 ) {
LLAMA_LOG_ERROR ( " %s: failed to decode, ret = %d \n " , __func__ , ret ) ;
2023-06-28 17:53:37 +02:00
}
2023-09-28 18:04:36 +02:00
return ret ;
2023-06-28 17:53:37 +02:00
}
int llama_eval_embd (
struct llama_context * ctx ,
2023-09-28 18:04:36 +02:00
float * embd ,
int32_t n_tokens ,
2023-09-28 21:42:38 +02:00
int n_past ) {
2023-09-28 18:04:36 +02:00
llama_kv_cache_tokens_rm ( ctx - > kv_self , n_past , - 1 ) ;
2023-05-08 16:41:54 +02:00
2023-09-28 18:04:36 +02:00
llama_batch batch = { n_tokens , nullptr , embd , nullptr , nullptr , nullptr , n_past , 1 , 0 , } ;
2023-09-28 21:42:38 +02:00
const int ret = llama_decode_internal ( * ctx , batch ) ;
2023-09-28 18:04:36 +02:00
if ( ret < 0 ) {
LLAMA_LOG_ERROR ( " %s: failed to decode, ret = %d \n " , __func__ , ret ) ;
2023-03-29 22:51:37 +02:00
}
2023-05-08 16:41:54 +02:00
2023-09-28 18:04:36 +02:00
return ret ;
2023-03-22 06:32:36 +01:00
}
2023-09-28 21:42:38 +02:00
void llama_set_n_threads ( struct llama_context * ctx , uint32_t n_threads , uint32_t n_threads_batch ) {
ctx - > cparams . n_threads = n_threads ;
ctx - > cparams . n_threads_batch = n_threads_batch ;
}
2023-09-28 18:04:36 +02:00
// Builds a batch over the caller's token array without allocating anything:
// the per-token arrays are null, and the first position (pos_0), a step of 1
// and the sequence id are stored in the all_* fields instead.
struct llama_batch llama_batch_get_one(
             llama_token * tokens,
                 int32_t   n_tokens,
               llama_pos   pos_0,
            llama_seq_id   seq_id) {
    struct llama_batch batch = {
        /*n_tokens   =*/ n_tokens,
        /*tokens     =*/ tokens,
        /*embd       =*/ nullptr,
        /*pos        =*/ nullptr,
        /*seq_id     =*/ nullptr,
        /*logits     =*/ nullptr,
        /*all_pos_0  =*/ pos_0,
        /*all_pos_1  =*/ 1,
        /*all_seq_id =*/ seq_id,
    };

    return batch;
}
2023-06-04 22:34:30 +02:00
2023-09-28 18:04:36 +02:00
struct llama_batch llama_batch_init ( int32_t n_tokens , int32_t embd ) {
llama_batch batch = { - 1 , nullptr , nullptr , nullptr , nullptr , nullptr , 0 , 0 , 0 , } ;
2023-06-04 22:34:30 +02:00
2023-09-28 18:04:36 +02:00
if ( embd ) {
batch . embd = ( float * ) malloc ( sizeof ( float ) * n_tokens * embd ) ;
} else {
batch . token = ( llama_token * ) malloc ( sizeof ( llama_token ) * n_tokens ) ;
2023-06-04 22:34:30 +02:00
}
2023-09-28 18:04:36 +02:00
batch . pos = ( llama_pos * ) malloc ( sizeof ( llama_pos ) * n_tokens ) ;
batch . seq_id = ( llama_seq_id * ) malloc ( sizeof ( llama_seq_id ) * n_tokens ) ;
batch . logits = ( int8_t * ) malloc ( sizeof ( int8_t ) * n_tokens ) ;
return batch ;
}
// Releases the arrays of a batch. Null members are fine: free() on a null
// pointer does nothing, so no explicit checks are needed.
void llama_batch_free(struct llama_batch batch) {
    free(batch.token);
    free(batch.embd);
    free(batch.pos);
    free(batch.seq_id);
    free(batch.logits);
}
int llama_decode (
struct llama_context * ctx ,
2023-09-28 21:42:38 +02:00
struct llama_batch batch ) {
const int ret = llama_decode_internal ( * ctx , batch ) ;
2023-09-28 18:04:36 +02:00
if ( ret < 0 ) {
LLAMA_LOG_ERROR ( " %s: failed to decode, ret = %d \n " , __func__ , ret ) ;
}
return ret ;
2023-06-04 22:34:30 +02:00
}
2023-08-21 22:07:43 +02:00
float * llama_get_logits ( struct llama_context * ctx ) {
return ctx - > logits . data ( ) ;
2023-03-22 06:32:36 +01:00
}
2023-09-28 18:04:36 +02:00
float * llama_get_logits_ith ( struct llama_context * ctx , int32_t i ) {
return ctx - > logits . data ( ) + i * ctx - > model . hparams . n_vocab ;
}
2023-08-21 22:07:43 +02:00
float * llama_get_embeddings ( struct llama_context * ctx ) {
return ctx - > embedding . data ( ) ;
2023-07-14 20:55:24 +02:00
}
2023-08-21 22:07:43 +02:00
const char * llama_token_get_text ( const struct llama_context * ctx , llama_token token ) {
return ctx - > model . vocab . id_to_token [ token ] . text . c_str ( ) ;
2023-07-14 20:55:24 +02:00
}
2023-08-21 22:07:43 +02:00
float llama_token_get_score ( const struct llama_context * ctx , llama_token token ) {
return ctx - > model . vocab . id_to_token [ token ] . score ;
2023-07-14 20:55:24 +02:00
}
2023-08-21 22:07:43 +02:00
// Returns the type of the vocabulary entry for `token`.
// `token` is used as an index without a range check.
llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
    const auto & entry = ctx->model.vocab.id_to_token[token];
    return entry.type;
}
2023-08-21 22:07:43 +02:00
// Returns the vocabulary's special_bos_id.
llama_token llama_token_bos(const struct llama_context * ctx) {
    const auto & vocab = ctx->model.vocab;
    return vocab.special_bos_id;
}
2023-08-21 22:07:43 +02:00
// Returns the vocabulary's special_eos_id.
llama_token llama_token_eos(const struct llama_context * ctx) {
    const auto & vocab = ctx->model.vocab;
    return vocab.special_eos_id;
}
2023-08-21 22:07:43 +02:00
// Returns the vocabulary's linefeed_id.
llama_token llama_token_nl(const struct llama_context * ctx) {
    const auto & vocab = ctx->model.vocab;
    return vocab.linefeed_id;
}
2023-10-02 09:42:02 +02:00
// Returns the vocabulary's special_prefix_id.
llama_token llama_token_prefix(const struct llama_context * ctx) {
    const auto & vocab = ctx->model.vocab;
    return vocab.special_prefix_id;
}
// Returns the vocabulary's special_middle_id.
llama_token llama_token_middle(const struct llama_context * ctx) {
    const auto & vocab = ctx->model.vocab;
    return vocab.special_middle_id;
}
// Returns the vocabulary's special_suffix_id.
llama_token llama_token_suffix(const struct llama_context * ctx) {
    const auto & vocab = ctx->model.vocab;
    return vocab.special_suffix_id;
}
// Returns the vocabulary's special_eot_id.
llama_token llama_token_eot(const struct llama_context * ctx) {
    const auto & vocab = ctx->model.vocab;
    return vocab.special_eot_id;
}
2023-03-25 19:51:14 +01:00
2023-08-21 22:07:43 +02:00
int llama_tokenize (
const struct llama_model * model ,
const char * text ,
2023-09-16 13:41:33 +02:00
int text_len ,
2023-08-21 22:07:43 +02:00
llama_token * tokens ,
int n_max_tokens ,
bool add_bos ) {
2023-09-16 13:41:33 +02:00
auto res = llama_tokenize_internal ( model - > vocab , std : : string ( text , text_len ) , add_bos ) ;
2023-08-21 22:07:43 +02:00
if ( n_max_tokens < ( int ) res . size ( ) ) {
2023-09-13 15:19:44 +02:00
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
2023-08-21 22:07:43 +02:00
return - ( ( int ) res . size ( ) ) ;
2023-03-22 06:32:36 +01:00
}
2023-08-21 22:07:43 +02:00
for ( size_t i = 0 ; i < res . size ( ) ; i + + ) {
tokens [ i ] = res [ i ] ;
}
2023-07-14 20:55:24 +02:00
2023-08-21 22:07:43 +02:00
return res . size ( ) ;
2023-03-22 06:32:36 +01:00
}
2023-10-03 09:16:26 +02:00
static std : : string llama_decode_text ( const std : : string & text ) {
std : : string decoded_text ;
auto unicode_sequences = codepoints_from_utf8 ( text ) ;
for ( auto & unicode_sequence : unicode_sequences ) {
decoded_text + = unicode_to_bytes_bpe ( codepoint_to_utf8 ( unicode_sequence ) ) ;
}
return decoded_text ;
}
2023-08-27 13:19:19 +02:00
// does not write null-terminator to buf
2023-09-28 21:42:38 +02:00
int llama_token_to_piece ( const struct llama_model * model , llama_token token , char * buf , int length ) {
if ( 0 < = token & & token < llama_n_vocab ( model ) ) {
2023-10-03 09:16:26 +02:00
switch ( llama_vocab_get_type ( model - > vocab ) ) {
case LLAMA_VOCAB_TYPE_SPM : {
if ( llama_is_normal_token ( model - > vocab , token ) ) {
std : : string result = model - > vocab . id_to_token [ token ] . text ;
2023-08-24 11:26:01 +02:00
llama_unescape_whitespace ( result ) ;
2023-10-03 09:16:26 +02:00
if ( length < ( int ) result . length ( ) ) {
return - result . length ( ) ;
}
memcpy ( buf , result . c_str ( ) , result . length ( ) ) ;
return result . length ( ) ;
} else if ( llama_is_unknown_token ( model - > vocab , token ) ) { // NOLINT
if ( length < 3 ) {
return - 3 ;
}
memcpy ( buf , " \xe2 \x96 \x85 " , 3 ) ;
return 3 ;
} else if ( llama_is_control_token ( model - > vocab , token ) ) {
;
} else if ( llama_is_byte_token ( model - > vocab , token ) ) {
if ( length < 1 ) {
return - 1 ;
}
buf [ 0 ] = llama_token_to_byte ( model - > vocab , token ) ;
return 1 ;
} else {
GGML_ASSERT ( false ) ;
2023-08-21 22:07:43 +02:00
}
2023-10-03 09:16:26 +02:00
break ;
}
case LLAMA_VOCAB_TYPE_BPE : {
if ( llama_is_normal_token ( model - > vocab , token ) ) {
std : : string result = model - > vocab . id_to_token [ token ] . text ;
result = llama_decode_text ( result ) ;
if ( length < ( int ) result . length ( ) ) {
return - result . length ( ) ;
}
memcpy ( buf , result . c_str ( ) , result . length ( ) ) ;
return result . length ( ) ;
} else if ( llama_is_control_token ( model - > vocab , token ) ) {
;
} else {
GGML_ASSERT ( false ) ;
2023-08-21 22:07:43 +02:00
}
2023-10-03 09:16:26 +02:00
break ;
}
default :
GGML_ASSERT ( false ) ;
2023-08-21 22:07:43 +02:00
}
}
return 0 ;
2023-03-22 06:32:36 +01:00
}
2023-07-05 22:51:13 +02:00
struct llama_timings llama_get_timings ( struct llama_context * ctx ) {
struct llama_timings result = {
/*.t_start_ms =*/ 1e-3 * ctx - > t_start_us ,
/*.t_end_ms =*/ 1.00 * ggml_time_ms ( ) ,
/*.t_load_ms =*/ 1e-3 * ctx - > t_load_us ,
/*.t_sample_ms =*/ 1e-3 * ctx - > t_sample_us ,
/*.t_p_eval_ms =*/ 1e-3 * ctx - > t_p_eval_us ,
/*.t_eval_ms =*/ 1e-3 * ctx - > t_eval_us ,
/*.n_sample =*/ std : : max ( 1 , ctx - > n_sample ) ,
/*.n_p_eval =*/ std : : max ( 1 , ctx - > n_p_eval ) ,
/*.n_eval =*/ std : : max ( 1 , ctx - > n_eval ) ,
} ;
2023-03-22 06:32:36 +01:00
2023-07-05 22:51:13 +02:00
return result ;
}
2023-03-22 06:32:36 +01:00
2023-07-05 22:51:13 +02:00
void llama_print_timings ( struct llama_context * ctx ) {
const llama_timings timings = llama_get_timings ( ctx ) ;
2023-03-22 06:32:36 +01:00
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " \n " ) ;
LLAMA_LOG_INFO ( " %s: load time = %8.2f ms \n " , __func__ , timings . t_load_ms ) ;
LLAMA_LOG_INFO ( " %s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second) \n " ,
2023-07-05 22:51:13 +02:00
__func__ , timings . t_sample_ms , timings . n_sample , timings . t_sample_ms / timings . n_sample , 1e3 / timings . t_sample_ms * timings . n_sample ) ;
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second) \n " ,
2023-07-05 22:51:13 +02:00
__func__ , timings . t_p_eval_ms , timings . n_p_eval , timings . t_p_eval_ms / timings . n_p_eval , 1e3 / timings . t_p_eval_ms * timings . n_p_eval ) ;
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second) \n " ,
2023-07-05 22:51:13 +02:00
__func__ , timings . t_eval_ms , timings . n_eval , timings . t_eval_ms / timings . n_eval , 1e3 / timings . t_eval_ms * timings . n_eval ) ;
2023-08-09 22:46:40 +02:00
LLAMA_LOG_INFO ( " %s: total time = %8.2f ms \n " , __func__ , ( timings . t_end_ms - timings . t_start_ms ) ) ;
2023-03-22 06:32:36 +01:00
}
void llama_reset_timings ( struct llama_context * ctx ) {
ctx - > t_start_us = ggml_time_us ( ) ;
ctx - > t_sample_us = ctx - > n_sample = 0 ;
ctx - > t_eval_us = ctx - > n_eval = 0 ;
2023-03-25 15:34:23 +01:00
ctx - > t_p_eval_us = ctx - > n_p_eval = 0 ;
2023-03-22 06:32:36 +01:00
}
const char * llama_print_system_info ( void ) {
static std : : string s ;
s = " " ;
2023-04-17 15:10:57 +02:00
s + = " AVX = " + std : : to_string ( ggml_cpu_has_avx ( ) ) + " | " ;
s + = " AVX2 = " + std : : to_string ( ggml_cpu_has_avx2 ( ) ) + " | " ;
s + = " AVX512 = " + std : : to_string ( ggml_cpu_has_avx512 ( ) ) + " | " ;
s + = " AVX512_VBMI = " + std : : to_string ( ggml_cpu_has_avx512_vbmi ( ) ) + " | " ;
s + = " AVX512_VNNI = " + std : : to_string ( ggml_cpu_has_avx512_vnni ( ) ) + " | " ;
s + = " FMA = " + std : : to_string ( ggml_cpu_has_fma ( ) ) + " | " ;
s + = " NEON = " + std : : to_string ( ggml_cpu_has_neon ( ) ) + " | " ;
s + = " ARM_FMA = " + std : : to_string ( ggml_cpu_has_arm_fma ( ) ) + " | " ;
s + = " F16C = " + std : : to_string ( ggml_cpu_has_f16c ( ) ) + " | " ;
s + = " FP16_VA = " + std : : to_string ( ggml_cpu_has_fp16_va ( ) ) + " | " ;
s + = " WASM_SIMD = " + std : : to_string ( ggml_cpu_has_wasm_simd ( ) ) + " | " ;
s + = " BLAS = " + std : : to_string ( ggml_cpu_has_blas ( ) ) + " | " ;
s + = " SSE3 = " + std : : to_string ( ggml_cpu_has_sse3 ( ) ) + " | " ;
2023-08-27 10:10:25 +02:00
s + = " SSSE3 = " + std : : to_string ( ggml_cpu_has_ssse3 ( ) ) + " | " ;
2023-04-17 15:10:57 +02:00
s + = " VSX = " + std : : to_string ( ggml_cpu_has_vsx ( ) ) + " | " ;
2023-03-22 06:32:36 +01:00
return s . c_str ( ) ;
}
2023-04-08 00:09:18 +02:00
2023-08-28 17:59:39 +02:00
void llama_dump_timing_info_yaml ( FILE * stream , const llama_context * ctx ) {
fprintf ( stream , " \n " ) ;
fprintf ( stream , " ########### \n " ) ;
fprintf ( stream , " # Timings # \n " ) ;
fprintf ( stream , " ########### \n " ) ;
fprintf ( stream , " \n " ) ;
fprintf ( stream , " mst_eval: %.2f # ms / token during generation \n " ,
1.0e-3 * ctx - > t_eval_us / ctx - > n_eval ) ;
fprintf ( stream , " mst_p_eval: %.2f # ms / token during prompt processing \n " ,
1.0e-3 * ctx - > t_p_eval_us / ctx - > n_p_eval ) ;
fprintf ( stream , " mst_sample: %.2f # ms / token during sampling \n " ,
1.0e-3 * ctx - > t_sample_us / ctx - > n_sample ) ;
fprintf ( stream , " n_eval: %d # number of tokens generated (excluding the first one) \n " , ctx - > n_eval ) ;
fprintf ( stream , " n_p_eval: %d # number of tokens processed in batches at the beginning \n " , ctx - > n_p_eval ) ;
fprintf ( stream , " n_sample: %d # number of sampled tokens \n " , ctx - > n_sample ) ;
train : mem usage and other improvements (#2439)
* fix track_max_mem in forward_batch_wo_cache_flash_attn_train
* remove unnecessary Adam(W) optimizer tensors.
reduces optimizer memory overhead from 7*modelsize to 2*modelsize.
additionally allows to optimize models with more than 2^31 parameters by replacing int with int64_t.
bumps training checkpoint file version, but old checkpoints can still be read.
new version with less tensors is saved.
* add gradient clipping to AdamW
* Fix reset of unused g->nodes and g->grads to NULL
* implement gradient checkpointing for training
reduces memory overhead from O(n_layer) to O(sqrt(n_layer))
as explained in readme of https://github.com/cybertronai/gradient-checkpointing
* remove unused compute buffer 3
* add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
* change AdamW decay parameter to work like the torch AdamW decay parameter
It is now relative to Adam learning rate `alpha*sched`.
Before that it was relative to `sched` only.
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1]
* change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT
* change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW
btw: the default weight decay parameter for torch.optim.AdamW is 0.01
* bug fixes for cross entropy loss
ggml_cross_entropy_loss: sums where not correctly added in workload of each thread
ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues
guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16
cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup.
so exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
* fix test-grad0 for cross_entropy_loss
the second argument to cross_entropy_loss must sum up to 1 for each row
* fix test-grad0 for soft_max
dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
* improve finite differences of test-grad0 by using double instead of float
* change cross_entropy_loss to output average over all rows
this helps keeping the loss and gradients in a sane range
* improve gradient checkpointing
sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal.
since layers require more memory than the single-tensor-checkpoint we use, the optimal values are compute different:
```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```
this change results in more checkpoints, requiring less layers to store between checkpoints, overall improving memory usage.
* disable gradient checkpointing debug output
* llama : fix rope usage in train-text-from-scratch after ChatGLM change
* add more training parameters:
--enable-restart N Only for Adam optimizer. Enable restarts of cos-decay
--disable-restart N Only for Adam optimizer. Disable restarts of cos-decay
--opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero.
--opt-delta N Maximum delta for delta convergence test. Disabled when <= zero.
--opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero.
--adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero.
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha
* replace memcpy with reshape operation so that the graph is not cut at the input
this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it
* remove unused function argument from get_example_targets_batch
* measure and print total training time
* add optimization callback to ggml_opt_resume_g
this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)).
can be used for dynamic learning schedule and setting input data for batches before each iteration
* use optimization callback in training
allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters
reduces runtime by avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration
* add minimum number of tensor dimensions to apply weight decay (default 2)
this allows to not apply weight decay to bias parameters
* rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup
* fix increase of model.train_samples and model.train_tokens
now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations
* change sampling parameters for prediction after training to defaults of common.h
and clarify what is context for prediction and what are generated tokens
* tighten abs error bounds for cross_entropy_loss in test-grad0
* add conditional compilation of using F16 exp in flash attention
uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention
* tighten abs error bounds for flash_attn in test-grad0
* tighten abs error bounds for sqrt in test-grad0
* remove out-commented vectorized code of opt_adam
the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead
* ggml : update ggml_rms_norm_back with configurable eps
* llama training : fix ggml_rms_norm_back calls to pass configurable eps
* remove trailing whitespace
* add train function using automatic gradient checkpointing backward pass and allocator
* in train function replace add_inplace by regular add
because using add_inplace seems to result in different gradients
* don't use allocate hash_map on context
because the context has no_alloc=True when using memory allocator resulting in NULL data pointers
* correctly clone reshape and permute operations by also cloning tensor->nb values
* fix variable name and add missing type cast
* terminate recursive tensor cloning when reaching tensor without src tensors
* correctly clone view tensors by setting data pointers
without this the checkpointing would only work when being used together with memory allocator
* fix variable names
* swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn`
* add input tensors as checkpoints
so that recursive tensor cloning of gradient checkpointing terminates on input tensors
* fix variable name and add missing boolean negation
* make sure some tensors are not reallocated by inserting new temporary nodes depending on them:
output and parameter gradient tensors need to be available at the end of the graph execution
parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration
checkpoint tensors are allocated all together to reduce memory allocator fragmentation
afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs
* fix ASSERT to work with zero layers
* add training options whether to use allocator and/or unified training function
* integrate unified training function which may use memory allocator
the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing
* format name of cloned tensors with " (clone)" suffix
* set names for tensors in unified train function for easier debugging
* allocate graph on context using ggml_new_graph
* remove handwritten training functions
* remove unused training parameters "use_scratch" and "use_unified"
* remove trailing whitespace
* remove unused train params: mem_compute1_gb & mem_compute2_gb
mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions
mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented)
* remove unused forward_batch function
* add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly
* only use ggml_allocr_alloc when tensor has NULL data and is no view
* fix test when to create temporary backward graph
temporary backward graph is only necessary when using checkpointing
* fix memory "leak" in optimizers
each iteration a new cplan with new memory for work data was allocated.
now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data.
* reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator
with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; 2 layer memory it saves 2% memory.
the computation results are the same
* add missing lctx argument to get_example_targets_batch
* implement llama model file saving using gguf
checkpoint loading and saving disabled, to be replaced by loading and saving via gguf
* implement loading/saving of checkpointing files using GGUF
* bug fixes
* add checkpoint file version for future compatibility
* update readme with gguf filenames
* save & load opt->just_initialized value
* add first draft for checkpoint conversion script
* add gguf arch and ftype
* save opt parameter counter as uint64
* add gguf key and tensor names for optimizer and training
* add layer_norm_rms_eps to checkpoint convert script
* use same GGUF_GET_KEY macro as in llama.cpp
* use norm_rms_eps, and rope parameters and command line options to set them
* fix memory corruption bug in gguf
ctx->kv and ctx->infos was reallocated using not-aligned realloc, but freed with aligned free.
to fix this a GGML_ALIGNED_REALLOC was added, but there is no posix_memalign_realloc function.
so on non-windows and non-mingw32 platforms we fall back to aligned malloc, followed by copying
and freeing the old data.
* add gguf example cmake file
* bug fixes in tokenize_file
* bug fixes in load_llama_model_gguf
* bug fix: init model when no checkpoint was loaded
* bug fix in read_tensor_by_name
* bug fix in load_opt_context_gguf
* avoid printing lots of spaced on the unusual case that loss gets nan
* set name of tensors with empty name from what was read from gguf
* remove trailing whitespace
* print data checksums before saving and after loading to verify correctness
* bug fixes for convert-train-checkpoint-to-gguf
* temporarily add code to write old checkpoint files
used to verify that old checkpoint files are correctly converted to gguf
* bug fixes for convert-train-checkpoint-to-gguf.py loading checkpoints with opt_version=0
* remove code used to verify correctness of checkpoint file conversion
* remove trailing whitespace
* remove prediction related code
use main for prediction, it is better optimized
* update train-text-from-scratch README.md
* fix non-windows GGML_ALIGNED_REALLOC
* add missing blank line at end of file
* remove GGML_ALIGNED_REALLOC and use normal malloc/realloc/free for gguf ctx->kv & ctx->infos
* train : fix compile warnings
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-08-28 21:51:47 +02:00
fprintf ( stream , " t_eval_us: % " PRId64 " # total microseconds spent generating tokens \n " , ctx - > t_eval_us ) ;
fprintf ( stream , " t_load_us: % " PRId64 " # total microseconds spent loading the model \n " , ctx - > t_load_us ) ;
fprintf ( stream , " t_p_eval_us: % " PRId64 " # total microseconds spent prompt processing \n " , ctx - > t_p_eval_us ) ;
fprintf ( stream , " t_sample_us: % " PRId64 " # total microseconds spent sampling \n " , ctx - > t_sample_us ) ;
2023-08-28 17:59:39 +02:00
fprintf ( stream , " ts_eval: %.2f # tokens / second during generation \n " ,
1.0e6 * ctx - > n_eval / ctx - > t_eval_us ) ;
fprintf ( stream , " ts_p_eval: %.2f # tokens / second during prompt processing \n " ,
1.0e6 * ctx - > n_p_eval / ctx - > t_p_eval_us ) ;
fprintf ( stream , " ts_sample: %.2f # tokens / second during sampling \n " ,
1.0e6 * ctx - > n_sample / ctx - > t_sample_us ) ;
}
2023-04-08 00:09:18 +02:00
// For internal test use
2023-09-15 21:38:27 +02:00
const std : : vector < std : : pair < std : : string , struct ggml_tensor * > > & llama_internal_get_tensor_map (
struct llama_context * ctx
) {
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 21:24:37 +02:00
return ctx - > model . tensors_by_name ;
2023-04-08 00:09:18 +02:00
}
2023-08-09 22:46:40 +02:00
2023-09-27 17:48:33 +02:00
// Register the callback that receives formatted log text, along with an opaque
// pointer that is handed back to it on every call.
// A null `log_callback` selects llama_log_callback_default instead.
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    if (log_callback) {
        g_state.log_callback = log_callback;
    } else {
        g_state.log_callback = llama_log_callback_default;
    }

    g_state.log_callback_user_data = user_data;
}
2023-09-27 17:48:33 +02:00
// Format a printf-style message and pass the resulting text to the registered
// callback (g_state.log_callback) together with g_state.log_callback_user_data.
//
// level  - severity, forwarded unchanged to the callback
// format - printf-style format string
// args   - arguments for `format`; they are consumed by vsnprintf here, the
//          caller remains responsible for its own va_end
//
// Messages that fit are formatted into a small stack buffer. Longer ones are
// formatted a second time into a heap buffer of exactly the required size.
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
    // a va_list cannot be reused once vsnprintf has consumed it, so keep a
    // copy for the possible second formatting pass
    va_list args_copy;
    va_copy(args_copy, args);

    char buffer[128];
    const int len = vsnprintf(buffer, sizeof(buffer), format, args);

    if (len < 0) {
        // vsnprintf reported an error: the contents of `buffer` are not
        // specified, so never hand them to the callback. Forward the raw
        // format string instead so the message is not lost completely.
        g_state.log_callback(level, format, g_state.log_callback_user_data);
    } else if ((size_t) len < sizeof(buffer)) {
        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
    } else {
        // `len` excludes the terminator; the vector is zero-initialised, so the
        // text stays NUL-terminated even if the second pass fails
        std::vector<char> large_buffer((size_t) len + 1);
        vsnprintf(large_buffer.data(), large_buffer.size(), format, args_copy);
        g_state.log_callback(level, large_buffer.data(), g_state.log_callback_user_data);
    }

    va_end(args_copy);
}
2023-09-27 17:48:33 +02:00
static void llama_log_internal ( ggml_log_level level , const char * format , . . . ) {
2023-08-09 22:46:40 +02:00
va_list args ;
va_start ( args , format ) ;
llama_log_internal_v ( level , format , args ) ;
va_end ( args ) ;
}
2023-09-27 17:48:33 +02:00
// Default log sink: writes `text` to stderr as-is and flushes immediately.
// Neither the severity nor the user data pointer is used.
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
    // parameters exist only to match the callback signature
    (void) user_data;
    (void) level;

    fputs(text, stderr);
    fflush(stderr);
}