2023-04-20 03:14:14 +02:00
# Define the default target now so that it is always the first target
2023-09-04 12:39:57 +02:00
# Everything the `default` goal builds, one name per line for easier diffs.
BUILD_TARGETS = \
    main \
    quantize \
    quantize-stats \
    perplexity \
    embedding \
    vdot \
    train-text-from-scratch \
    convert-llama2c-to-ggml \
    simple \
    save-load-state \
    server \
    embd-input-test \
    gguf \
    llama-bench \
    baby-llama \
    beam-search \
    speculative \
    tests/test-c.o
2023-05-27 19:04:14 +02:00
2023-07-21 12:09:16 +02:00
# Binaries only useful for tests
2023-08-28 17:38:35 +02:00
# Test executables; the `test` recipe iterates over this list and `all` builds it.
TEST_TARGETS = \
    tests/test-llama-grammar \
    tests/test-grammar-parser \
    tests/test-double-float \
    tests/test-grad0 \
    tests/test-opt \
    tests/test-quantize-fns \
    tests/test-quantize-perf \
    tests/test-sampling \
    tests/test-tokenizer-0-llama \
    tests/test-tokenizer-0-falcon \
    tests/test-tokenizer-1
2023-07-21 12:09:16 +02:00
2023-09-03 10:48:49 +02:00
# Code coverage output files
# Coverage artefacts.  The globs are stored unexpanded (no $(wildcard)), so
# they are only expanded by whatever shell command eventually receives them.
COV_TARGETS = \
    *.gcno tests/*.gcno \
    *.gcda tests/*.gcda \
    *.gcov tests/*.gcov \
    lcov-report \
    gcovr-report
2023-05-27 19:04:14 +02:00
# First rule in the file, hence the default goal: build every BUILD_TARGETS entry.
default: $(BUILD_TARGETS)
2023-04-20 03:14:14 +02:00
2023-08-30 11:42:51 +02:00
# Build every test binary, then run them one after another.
#
#  * tests/test-tokenizer-0-llama is passed the llama vocab file as argument.
#  * tests/test-tokenizer-0-falcon and tests/test-tokenizer-1 are skipped.
#  * every other binary is run without arguments.
#
# The exit status of each test is checked: failing tests are collected and
# make exits non-zero after the whole list has been attempted (previously only
# the status of the very last loop iteration reached make, so earlier failures
# went unnoticed).  The target is phony so a stray file named `test` cannot
# mask it.
.PHONY: test
test: $(TEST_TARGETS)
	@echo "Running tests..."
	@failed=""; \
	for test_target in $(TEST_TARGETS); do \
		case "$$test_target" in \
			(tests/test-tokenizer-0-llama) \
				./$$test_target "$(CURDIR)/models/ggml-vocab-llama.gguf" || failed="$$failed $$test_target" ;; \
			(tests/test-tokenizer-0-falcon|tests/test-tokenizer-1) \
				continue ;; \
			(*) \
				./$$test_target || failed="$$failed $$test_target" ;; \
		esac; \
	done; \
	if [ -n "$$failed" ]; then \
		echo "Failed tests:$$failed"; \
		exit 1; \
	fi
	@echo "All tests have been run."
# Superset of `default`: the regular programs plus all test binaries.
all: $(BUILD_TARGETS) $(TEST_TARGETS)
2023-09-03 10:48:49 +02:00
# `coverage` names an action, not a file: declare it phony so that a file or
# directory called `coverage` can never make the target look up to date.
.PHONY: coverage
coverage: ## Run code coverage
	gcov -pb tests/*.cpp
# The recipe creates a directory with the same name as the target, so the
# target must be phony: otherwise the existing directory is what make would
# compare timestamps against on later runs.
.PHONY: lcov-report
lcov-report: coverage ## Generate lcov report
	mkdir -p lcov-report
	lcov --capture --directory . --output-file lcov-report/coverage.info
	genhtml lcov-report/coverage.info --output-directory lcov-report
# The recipe creates a directory with the same name as the target, so the
# target must be phony: otherwise the existing directory is what make would
# compare timestamps against on later runs.
.PHONY: gcovr-report
gcovr-report: coverage ## Generate gcovr report
	mkdir -p gcovr-report
	gcovr --root . --html --html-details --output gcovr-report/coverage.html
2023-03-10 19:40:58 +01:00
# Host identification.  A value that is already set (and non-empty) is kept
# as is; `uname` is only queried for the ones that are missing.
#   UNAME_S  <- uname -s
#   UNAME_P  <- uname -p
#   UNAME_M  <- uname -m
ifndef UNAME_S
  UNAME_S := $(shell uname -s)
endif

ifndef UNAME_P
  UNAME_P := $(shell uname -p)
endif

ifndef UNAME_M
  UNAME_M := $(shell uname -m)
endif
2023-09-01 15:27:40 +02:00
# When RISCV_CROSS_COMPILE is set, both compilers are replaced by the
# riscv64-unknown-linux-gnu GCC cross toolchain.
ifdef RISCV_CROSS_COMPILE
  CC  := riscv64-unknown-linux-gnu-gcc
  CXX := riscv64-unknown-linux-gnu-g++
endif
2023-03-10 19:40:58 +01:00
# First line of each compiler's `--version` output, captured once at parse time.
CCV  := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
#
# On Darwin, if `uname -p` is not "arm" but sysctl says hw.optional.arm64 is 1,
# a parse-time warning is printed.  The overrides of UNAME_P/UNAME_M below are
# commented out, so both variables keep the values reported by uname.
ifeq ($(UNAME_S),Darwin)
  ifneq ($(UNAME_P),arm)
    SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
    ifeq ($(SYSCTL_M),1)
      # UNAME_P := arm
      # UNAME_M := arm64
      warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
    endif
  endif
endif
#
# Compile flags
#
2023-03-21 16:29:41 +01:00
# keep standard at C11 and C++11
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
* Starting to add k-quantization to ggml
I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.
* Adding Q3_K and Q8_K (de)-quantization
* Q3_K now working on CUDA and AVX2/scalar
CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).
* Some improvement for Q3_K on CUDA
It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.
* Some more CUDA optimizations for Q3_K
Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.
* Adding Q4_K - scalar, AVX2, CUDA
Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity) is about the same.
* Adding Q6_K - scalar, AVX2, CUDA
Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).
* Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.
* Per convention, all QX_K quantizations use Q5_K for output.weight
* Adding quantization mixes
* Quantization mixes: didn't quite get what I wanted in the last commit
* Q4_K dot product for ARM_NEON
* Q6_K dot product for ARM_NEON
* Q5_K dot product for ARM_NEON
* Adding Q3_K dot for ARM_NEON
It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.
* A very slightly faster ARM_NEON Q3_K dot
* Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.
* Adding scalar and AVX2 Q2_K dot
* Adding ARM_NEON Q2_K dot
About the same performance as Q4_K.
* A slightly faster ARM_NEON Q2_K dot
Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.
* Fixed bug in Q2_K CUDA dot product kernel
Strangely enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonsense back.
In any case, Q2_K single token evaluation times on an RTX 4080 in a Ryzen 7950X
box using CUDA and with the model fully loaded on the GPU are
~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for the 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).
* Don't print zeros/NaNs when no count histogram has been collected
* A 10% faster CUDA vector dot kernel for Q3_K
Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems the memory access pattern is more important for
performance than the amount of computation the kernel
does.
* A slightly faster Q4_K AVX2 dot product
For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.
* A slightly faster ARM_NEON Q4_K dot product
* Minor
* Fix quantization error test
We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.
* Fix docker build
I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.
* Added forgotten ggml.o dependence on k_quants.h to the Makefile
* Had unintentionally committed the Makefile with -Ofast enabled
* ggml : rename k_quants -> ggml-quants-k, use lowercase in code
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 21:56:18 +02:00
# -Ofast tends to produce faster code, but may not be available for some compilers.
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as fast as
with super-blocks with 256 weights: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because of the larger model size), so some fraction of
these 20% is due to that.
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: switch Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 18:43:07 +02:00
# Optimisation level stored in OPT: -Ofast when LLAMA_FAST is set, -O3 otherwise.
ifdef LLAMA_FAST
  OPT = -Ofast
else
  OPT = -O3
endif
2023-09-03 07:26:59 +02:00
# Baseline values of the MK_* flag accumulators; the sections that follow
# append to them.  Language standards are pinned here (C11 / C++11).
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS   = $(CPPFLAGS) $(OPT) -std=c11 -fPIC
MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
MK_LDFLAGS  =
2023-03-10 19:40:58 +01:00
2023-05-28 21:01:02 +02:00
# LLAMA_DEBUG set:   append -O0 -g to the compile flags (placed after $(OPT)
#                    in the flag string) and -g to the link flags.
# LLAMA_DEBUG unset: define NDEBUG for the preprocessor.
ifdef LLAMA_DEBUG
  MK_CFLAGS   += -O0 -g
  MK_CXXFLAGS += -O0 -g
  MK_LDFLAGS  += -g
else
  MK_CPPFLAGS += -DNDEBUG
endif
2023-07-04 14:38:04 +02:00
# Forward the value of LLAMA_SERVER_VERBOSE to the code as the SERVER_VERBOSE macro.
ifdef LLAMA_SERVER_VERBOSE
  MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
2023-09-03 10:48:49 +02:00
# Instrument C++ objects for gcov when LLAMA_CODE_COVERAGE is set.
# The flags go into MK_CXXFLAGS like every other C++ option in this file:
# a plain `CXXFLAGS +=` is silently ignored by make whenever the user passes
# CXXFLAGS=... on the command line, which would drop the instrumentation.
# NOTE(review): assumes MK_CXXFLAGS is folded into the final CXXFLAGS further
# down, as the other MK_CXXFLAGS users in this file already require.
ifdef LLAMA_CODE_COVERAGE
  MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
endif
2023-09-01 11:07:06 +02:00
# Define LOG_DISABLE_LOGS when LLAMA_DISABLE_LOGS is set.
# It is a preprocessor define, so it is added once to MK_CPPFLAGS (like
# -DNDEBUG and -DGGML_PERF) instead of separately to CFLAGS and CXXFLAGS;
# plain `CFLAGS +=` / `CXXFLAGS +=` would be silently ignored by make when
# the user sets those variables on the command line.
# NOTE(review): assumes MK_CPPFLAGS reaches both the C and the C++ compile
# commands, as the other -D defines in this file already require.
ifdef LLAMA_DISABLE_LOGS
  MK_CPPFLAGS += -DLOG_DISABLE_LOGS
endif # LLAMA_DISABLE_LOGS
2023-03-28 18:48:20 +02:00
# warnings
# C gets the larger set; C++ shares the first four flags plus
# -Wno-unused-function and additionally silences -Wmultichar.
MK_CFLAGS += \
    -Wall -Wextra -Wpedantic -Wcast-qual \
    -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
    -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
MK_CXXFLAGS += \
    -Wall -Wextra -Wpedantic -Wcast-qual \
    -Wno-unused-function -Wno-multichar
2023-03-28 18:48:20 +02:00
2023-09-01 15:34:50 +02:00
# g++ only: the flag is added whenever $(CXX) does not contain "clang++".
# Uses MK_CXXFLAGS like the other warning flags; a plain `CXXFLAGS +=` is
# silently ignored by make when CXXFLAGS is given on the command line.
# NOTE(review): assumes MK_CXXFLAGS is folded into the final CXXFLAGS further
# down, as the other MK_CXXFLAGS users in this file already require.
ifeq '' '$(findstring clang++,$(CXX))'
  MK_CXXFLAGS += -Wno-format-truncation
endif
2023-03-10 19:40:58 +01:00
# OS specific
# TODO: support Windows
# Enable -pthread for C and C++ when UNAME_S is one of the systems listed below.
ifneq ($(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku),)
  MK_CFLAGS   += -pthread
  MK_CXXFLAGS += -pthread
endif
2023-07-21 09:42:21 +02:00
# detect Windows: _WIN32 becomes 1 when UNAME_S contains the substring "_NT"
ifneq '' '$(findstring _NT,$(UNAME_S))'
  _WIN32 := 1
endif
# Platform-dependent naming and link settings, selected by _WIN32:
#   LIB_PRE    library name prefix; assigned ("lib") only when not on Windows
#   DSO_EXT    Dynamic Shared Object extension (.dll on Windows, .so otherwise)
#   LWINSOCK2  Windows Sockets 2 (Winsock) for network-capable apps;
#              assigned only on Windows
ifeq ($(_WIN32),1)
  DSO_EXT   := .dll
  LWINSOCK2 := -lws2_32
else
  LIB_PRE := lib
  DSO_EXT := .so
endif
2023-05-13 16:25:09 +02:00
# LLAMA_GPROF: compile C and C++ sources with -pg.
ifdef LLAMA_GPROF
  MK_CFLAGS   += -pg
  MK_CXXFLAGS += -pg
endif
# LLAMA_PERF: define the GGML_PERF preprocessor macro.
ifdef LLAMA_PERF
  MK_CPPFLAGS += -DGGML_PERF
endif
2023-03-10 19:40:58 +01:00
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
#
# Without RISCV set, the flags are picked from UNAME_M (and, for MinGW, from
# the compiler's target triple).  With RISCV set, a fixed rv64gcv/lp64d
# configuration is used instead.
#
# All flags are appended to MK_CFLAGS / MK_CXXFLAGS.  The MinGW and RISC-V
# branches used to append to CFLAGS / CXXFLAGS directly, which make silently
# ignores when those variables are given on the command line, and which was
# inconsistent with every other branch here.
# NOTE(review): assumes MK_CFLAGS / MK_CXXFLAGS are folded into the final
# CFLAGS / CXXFLAGS further down, as the other branches already require.
ifndef RISCV

ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
  # Use all CPU extensions that are available:
  MK_CFLAGS   += -march=native -mtune=native
  MK_CXXFLAGS += -march=native -mtune=native

  # Usage AVX-only
  #MK_CFLAGS   += -mfma -mf16c -mavx
  #MK_CXXFLAGS += -mfma -mf16c -mavx

  # Usage SSSE3-only (not SSE3!)
  #MK_CFLAGS   += -mssse3
  #MK_CXXFLAGS += -mssse3
endif

# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
  MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
  MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
endif

ifneq ($(filter aarch64%,$(UNAME_M)),)
  # Apple M1, M2, etc.
  # Raspberry Pi 3, 4, Zero 2 (64-bit)
  MK_CFLAGS   += -mcpu=native
  MK_CXXFLAGS += -mcpu=native
endif

ifneq ($(filter armv6%,$(UNAME_M)),)
  # Raspberry Pi 1, Zero
  MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
  MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif

ifneq ($(filter armv7%,$(UNAME_M)),)
  # Raspberry Pi 2
  MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
  MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif

ifneq ($(filter armv8%,$(UNAME_M)),)
  # Raspberry Pi 3, 4, Zero 2 (32-bit)
  MK_CFLAGS   += -mfp16-format=ieee -mno-unaligned-access
  MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

ifneq ($(filter ppc64%,$(UNAME_M)),)
  # -mcpu=power9 only when /proc/cpuinfo mentions POWER9
  POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
  ifneq (,$(findstring POWER9,$(POWER9_M)))
    MK_CFLAGS   += -mcpu=power9
    MK_CXXFLAGS += -mcpu=power9
  endif
endif

else

  MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
  MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d

endif
2023-06-07 09:59:52 +02:00
i f n d e f L L A M A _ N O _ K _ Q U A N T S
2023-09-03 07:26:59 +02:00
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
2023-06-07 09:59:52 +02:00
OBJS += k_quants.o
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 18:43:07 +02:00
i f d e f L L A M A _ Q K K _ 6 4
2023-09-03 07:26:59 +02:00
MK_CPPFLAGS += -DGGML_QKK_64
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights
Q6_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q4_K scalar and AVX2 works
* k_quants: WIP super-blocks with 64 weights
Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)
* k_quants: WIP super-blocks with 64 weights
Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights
Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar
* k_quants: WIP super-blocks with 64 weights
Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,
* k_quants: WIP super-blocks with 64 weights
Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights
Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights
Q6_K working on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Q4_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q2_K working on ARM_NEON, but quite a bit slower than 256 weights
* k_quants: WIP super-blocks with 64 weights
Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights
Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.
With that, we have full support for ARM_NEON, although
performance is not quite there.
* k_quants: WIP super-blocks with 64 weights
Slightly more efficient Q3_K and Q5_K
* k_quants: WIP super-blocks with 64 weights
Another small improvement for Q3_K and Q5_K on ARM_NEON
* k_quants: WIP super-blocks with 64 weights
Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights
* We are able to pass preprocessor macros to the Metal
compiler
* Q6_K works and is actually slightly more efficient than
the QK_K = 256 version (25.2 ms vs 25.8 ms)
* k_quants: WIP super-blocks with 64 weights
Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights
Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights
Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights
Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: switch Q3_K to 4-bit scales when QK_K = 64
Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64
Here the loss in accuracy is greater than for Q3_K,
but the Q4_K points still move further to the left on
the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64
Still needs AVX2 implementation
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-26 18:43:07 +02:00
e n d i f
2023-06-07 09:59:52 +02:00
e n d i f
2023-03-11 11:26:16 +01:00
# Apple Accelerate framework: used on Darwin unless LLAMA_NO_ACCELERATE is set.
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negligible
# performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
ifndef LLAMA_NO_ACCELERATE
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
MK_LDFLAGS  += -framework Accelerate
endif # LLAMA_NO_ACCELERATE
endif # Darwin
2023-07-10 17:49:56 +02:00
# MPI support (opt-in via LLAMA_MPI): define GGML_USE_MPI, pass -Wno-cast-qual
# to both the C and C++ compiles, and add ggml-mpi.o to the shared object list.
ifdef LLAMA_MPI
OBJS        += ggml-mpi.o
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS   += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
endif # LLAMA_MPI
2023-03-11 11:26:16 +01:00
# OpenBLAS support (opt-in via LLAMA_OPENBLAS). Include paths, remaining
# compile flags and link flags are all queried from pkg-config's `openblas`.
ifdef LLAMA_OPENBLAS
MK_LDFLAGS  += $(shell pkg-config --libs openblas)
MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
endif # LLAMA_OPENBLAS
2023-05-20 16:58:31 +02:00
# BLIS support (opt-in via LLAMA_BLIS). It is enabled through the same
# GGML_USE_OPENBLAS define, with headers and library taken from fixed paths.
ifdef LLAMA_BLIS
MK_LDFLAGS  += -lblis -L/usr/local/lib
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
endif # LLAMA_BLIS
2023-04-19 11:22:45 +02:00
# cuBLAS / CUDA support (opt-in via LLAMA_CUBLAS).
# Adds the CUDA include and library search paths plus the CUDA libraries,
# registers ggml-cuda.o, and builds NVCCFLAGS from the LLAMA_CUDA_* knobs.
ifdef LLAMA_CUBLAS
  MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
  MK_LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
  OBJS        += ggml-cuda.o
  NVCCFLAGS    = --forward-unknown-to-host-compiler -use_fast_math

  # CUDA compiler: LLAMA_CUDA_NVCC if given, plain `nvcc` otherwise.
  ifdef LLAMA_CUDA_NVCC
    NVCC = $(LLAMA_CUDA_NVCC)
  else
    NVCC = nvcc
  endif # LLAMA_CUDA_NVCC

  # Target architecture: the one named by CUDA_DOCKER_ARCH, else `native`.
  ifdef CUDA_DOCKER_ARCH
    NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
  else
    NVCCFLAGS += -arch=native
  endif # CUDA_DOCKER_ARCH

  ifdef LLAMA_CUDA_FORCE_DMMV
    NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
  endif # LLAMA_CUDA_FORCE_DMMV

  # GGML_CUDA_DMMV_X defaults to 32.
  ifdef LLAMA_CUDA_DMMV_X
    NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
  else
    NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
  endif # LLAMA_CUDA_DMMV_X

  # GGML_CUDA_MMV_Y defaults to 1. The older LLAMA_CUDA_DMMV_Y spelling is
  # still honoured for backwards compatibility.
  ifdef LLAMA_CUDA_MMV_Y
    NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
  else ifdef LLAMA_CUDA_DMMV_Y
    NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y)
  else
    NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
  endif # LLAMA_CUDA_MMV_Y

  # LLAMA_CUDA_F16 and LLAMA_CUDA_DMMV_F16 both set the same define.
  ifdef LLAMA_CUDA_F16
    NVCCFLAGS += -DGGML_CUDA_F16
  endif # LLAMA_CUDA_F16
  ifdef LLAMA_CUDA_DMMV_F16
    NVCCFLAGS += -DGGML_CUDA_F16
  endif # LLAMA_CUDA_DMMV_F16

  # K_QUANTS_PER_ITERATION defaults to 2.
  ifdef LLAMA_CUDA_KQUANTS_ITER
    NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
  else
    NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
  endif

  # Currently disabled:
  #ifdef LLAMA_CUDA_CUBLAS
  #  NVCCFLAGS += -DGGML_CUDA_CUBLAS
  #endif # LLAMA_CUDA_CUBLAS

  # Optional host compiler for nvcc.
  ifdef LLAMA_CUDA_CCBIN
    NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif

# The host CXXFLAGS are forwarded to nvcc with -Ofast replaced by -O3.
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
2023-06-04 22:34:30 +02:00
ggml : add CLBlast support (#1164)
* Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS for context processing
* Improve ClBlast implementation, avoid recreating buffers, remove redundant transfers
* Finish merge of ClBlast support
* Move CLBlast implementation to separate file
Add buffer reuse code (adapted from slaren's cuda implementation)
* Add q4_2 and q4_3 CLBlast support, improve code
* Double CLBlast speed by disabling OpenBLAS thread workaround
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
* Fix device selection env variable names
* Fix cast in opencl kernels
* Add CLBlast to CMakeLists.txt
* Replace buffer pool with static buffers a, b, qb, c
Fix compile warnings
* Fix typos, use GGML_TYPE defines, improve code
* Improve btype dequant kernel selection code, add error if type is unsupported
* Improve code quality
* Move internal stuff out of header
* Use internal enums instead of CLBlast enums
* Remove leftover C++ includes and defines
* Make event use easier to read
Co-authored-by: Henri Vasserman <henv@hot.ee>
* Use c compiler for opencl files
* Simplify code, fix include
* First check error, then release event
* Make globals static, fix indentation
* Rename dequant kernels file to conform with other file names
* Fix import cl file name
---------
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-28 16:57:16 +02:00
# CLBlast / OpenCL support (opt-in via LLAMA_CLBLAST).
# Compile flags come from pkg-config (`clblast` and `OpenCL`); ggml-opencl.o is
# added to the shared object list and built with the C++ compiler.
ifdef LLAMA_CLBLAST
  MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
  MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
  MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)

  ifneq ($(UNAME_S),Darwin)
    MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
  else
    # Mac provides OpenCL as a framework
    MK_LDFLAGS += -lclblast -framework OpenCL
  endif

  OBJS += ggml-opencl.o

ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
	$(CXX) $(CXXFLAGS) -o $@ -c $<
endif # LLAMA_CLBLAST
2023-08-25 11:09:42 +02:00
# hipBLAS / ROCm support (opt-in via LLAMA_HIPBLAS).
# Reuses the CUDA source: ggml-cuda.cu is compiled by $(HIPCC) as HIP
# (`-x hip`) with both GGML_USE_HIPBLAS and GGML_USE_CUBLAS defined.
ifdef LLAMA_HIPBLAS
  # Toolchain locations and tuning defaults; `?=` keeps any value already set.
  ROCM_PATH               ?= /opt/rocm
  HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
  GPU_TARGETS             ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
  LLAMA_CUDA_DMMV_X       ?= 32
  LLAMA_CUDA_MMV_Y        ?= 1
  LLAMA_CUDA_KQUANTS_ITER ?= 2

  MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
  MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
  OBJS        += ggml-cuda.o

  # One --offload-arch flag per entry in GPU_TARGETS.
  HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
  HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
  HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
  HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
  # NOTE(review): presumably a threshold set high enough that Turing-specific
  # code paths are never selected under HIP - confirm against ggml-cuda.cu.
  HIPFLAGS += -DCC_TURING=1000000000
  ifdef LLAMA_CUDA_FORCE_DMMV
    HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
  endif # LLAMA_CUDA_FORCE_DMMV

ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
2023-06-04 22:34:30 +02:00
# Metal support (opt-in via LLAMA_METAL): define GGML_USE_METAL, link the
# Foundation/Metal/MetalKit frameworks, and build ggml-metal.o from the
# Objective-C source with the C compiler.
ifdef LLAMA_METAL
# Optional extra define, currently off: -DGGML_METAL_NDEBUG
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
OBJS        += ggml-metal.o

ggml-metal.o: ggml-metal.m ggml-metal.h
	$(CC) $(CFLAGS) -o $@ -c $<
endif # LLAMA_METAL
# Compile rule for the MPI object; only defined when LLAMA_MPI is set.
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
	$(CC) $(CFLAGS) -o $@ -c $<
endif # LLAMA_MPI
2023-09-03 07:26:59 +02:00
# Compile rule for the k-quants object; omitted when LLAMA_NO_K_QUANTS is set.
ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
	$(CC) $(CFLAGS) -o $@ -c $<
endif # LLAMA_NO_K_QUANTS
2023-09-03 07:26:59 +02:00
# Merge the flags computed by this Makefile (MK_*) with any user-supplied
# values. The MK_* part comes first, so user flags land later on the command
# line. `override` lets the merged value win over a command-line assignment.
# NOTE(review): CPPFLAGS is merged before CFLAGS/CXXFLAGS, and the recipes here
# pass only CFLAGS/CXXFLAGS - presumably MK_CFLAGS/MK_CXXFLAGS reference
# $(CPPFLAGS) where they are defined, so keep this order; confirm.
override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
override CFLAGS   := $(MK_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
override LDFLAGS  := $(MK_LDFLAGS) $(LDFLAGS)
2023-03-10 19:40:58 +01:00
#
# Print build information
#
# Every $(info ...) below is a top-level call, so this summary is printed while
# the makefile is being read, before any target is built. It reports the
# platform variables, the final compile/link flags, and the compilers.
# NOTE(review): UNAME_P, UNAME_M, CCV and CXXV are defined outside this
# section - presumably CCV/CXXV hold the compiler version strings; confirm.
#
$(info I llama.cpp build info : )
$(info I UNAME_S : $( UNAME_S ) )
$(info I UNAME_P : $( UNAME_P ) )
$(info I UNAME_M : $( UNAME_M ) )
$(info I CFLAGS : $( CFLAGS ) )
$(info I CXXFLAGS : $( CXXFLAGS ) )
$(info I LDFLAGS : $( LDFLAGS ) )
$(info I CC : $( CCV ) )
$(info I CXX : $( CXXV ) )
$( info )
#
# Build library
#
2023-06-07 09:59:52 +02:00
# Core ggml object; rebuilt when ggml.c or either listed header changes.
ggml.o: ggml.c \
        ggml.h ggml-cuda.h
	$(CC) $(CFLAGS) -o $@ -c $<
2023-03-10 19:40:58 +01:00
2023-07-30 15:58:01 +02:00
# ggml-alloc.o is unconditionally part of the shared object list.
OBJS += ggml-alloc.o

ggml-alloc.o: ggml-alloc.c \
              ggml.h ggml-alloc.h
	$(CC) $(CFLAGS) -o $@ -c $<
2023-08-21 22:07:43 +02:00
# llama library object, compiled as C++.
llama.o: llama.cpp \
         ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
	$(CXX) $(CXXFLAGS) -o $@ -c $<
2023-03-22 06:32:36 +01:00
main : log file (#2748)
* initial, base LOG macro
* add *.log to .gitignore
* added basic log file handler
* reverted log auto endline to better mimic printf
* remove atomics and add dynamic log target
* log_enable/disable, LOG_TEE, basic usage doc
* update .gitignore
* mv include to common, params, help msg
* log tostring helpers, token vectors pretty prints
* main: replaced fprintf/LOG_TEE, some trace logging
* LOG_DISABLE_LOGS compile flag, wrapped f in macros
* fix LOG_TEELN and configchecker
* stub LOG_DUMP_CMDLINE for WIN32 for now
* fix msvc
* cleanup main.cpp:273
* fix stray whitespace after master sync
* log : fix compile warnings
- do not use C++20 stuff
- use PRIu64 to print uint64_t
- avoid string copies by using const ref
- fix ", ##__VA_ARGS__" warnings
- compare strings with == and !=
* log : do not append to existing log + disable file line func by default
* log : try to fix Windows build
* main : wip logs
* main : add trace log
* review: macro f lowercase, str append to sstream
* review: simplify ifs and str comparisons
* fix MSVC, formatting, FMT/VAL placeholders
* review: if/else cleanup
* review: if/else cleanup (2)
* replace _ prefix with _impl suffix
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-08-30 08:29:32 +02:00
# Shared example helpers; also depends on the generated build-info.h.
common.o: common/common.cpp \
          common/common.h build-info.h common/log.h
	$(CXX) $(CXXFLAGS) -o $@ -c $<
2023-03-10 19:40:58 +01:00
2023-08-21 22:07:43 +02:00
# Console helper object from common/.
console.o: common/console.cpp \
           common/console.h
	$(CXX) $(CXXFLAGS) -o $@ -c $<
2023-08-21 22:07:43 +02:00
# Grammar parser object from common/.
grammar-parser.o: common/grammar-parser.cpp \
                  common/grammar-parser.h
	$(CXX) $(CXXFLAGS) -o $@ -c $<
2023-06-07 09:59:52 +02:00
# Shared library made of llama.o, ggml.o and every object in $(OBJS).
libllama.so: llama.o ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
2023-03-10 19:40:58 +01:00
# Remove build products: objects, shared libraries, the benchmark binary, the
# generated build-info.h, *.dot files, coverage output, and every example and
# test binary named in BUILD_TARGETS / TEST_TARGETS.
# Declared phony so that a stray file named `clean` cannot turn this into a
# no-op ("'clean' is up to date").
.PHONY: clean
clean:
	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
2023-03-10 19:40:58 +01:00
2023-05-01 18:23:47 +02:00
#
# Examples
#
2023-08-04 17:20:12 +02:00
# Main example binary. Headers in the prerequisite list only trigger rebuilds;
# they are filtered out of the compile/link line. Prints a usage hint once the
# link succeeds.
main: examples/main/main.cpp build-info.h \
      ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
	@echo
	@echo '==== Run ./main -h for help. ===='
	@echo
2023-03-10 19:40:58 +01:00
2023-06-16 20:58:09 +02:00
# `simple` example binary.
simple: examples/simple/simple.cpp build-info.h \
        ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-06-07 09:59:52 +02:00
# `quantize` example binary (links without common.o).
quantize: examples/quantize/quantize.cpp build-info.h \
          ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-03-25 19:26:40 +01:00
2023-06-07 09:59:52 +02:00
# `quantize-stats` example binary (links without common.o).
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h \
                ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-04-08 00:09:18 +02:00
2023-06-07 09:59:52 +02:00
# `perplexity` example binary.
perplexity: examples/perplexity/perplexity.cpp build-info.h \
            ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-03-10 19:40:58 +01:00
2023-06-07 09:59:52 +02:00
# `embedding` example binary.
embedding: examples/embedding/embedding.cpp build-info.h \
           ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-03-28 08:11:09 +02:00
2023-06-07 09:59:52 +02:00
# `save-load-state` example binary.
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h \
                 ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-04-18 21:00:14 +02:00
2023-08-08 15:29:19 +02:00
# `server` example binary. The bundled .h/.hpp files are prerequisites only
# (both suffixes are filtered out of the link line) and are found through
# -Iexamples/server. $(LWINSOCK2) is appended after the regular link flags.
server: examples/server/server.cpp \
        examples/server/httplib.h \
        examples/server/json.hpp \
        examples/server/index.html.hpp \
        examples/server/index.js.hpp \
        examples/server/completion.js.hpp \
        build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp,$^) -o $@ $(LDFLAGS) $(LWINSOCK2)
2023-05-27 19:04:14 +02:00
2023-07-21 09:42:21 +02:00
# Shared library for the embd-input example; the file name is assembled from
# $(LIB_PRE) and $(DSO_EXT). Header prerequisites are filtered out of the link.
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h \
                               examples/embd-input/embd-input-lib.cpp \
                               build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h %.hpp,$^) -o $@ $(LDFLAGS)
2023-07-21 09:42:21 +02:00
# Test program for the embd-input library. The library is a prerequisite but
# is dropped from the input list and linked through -L. -lembdinput instead.
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) \
                 examples/embd-input/embd-input-test.cpp \
                 build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT) %.h %.hpp,$^) -o $@ $(LDFLAGS) -L. -lembdinput
2023-06-28 17:53:37 +02:00
2023-08-29 10:42:41 +02:00
# `gguf` example binary.
gguf: examples/gguf/gguf.cpp \
      ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-29 10:42:41 +02:00
# `train-text-from-scratch` example binary.
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
                         ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-29 10:42:41 +02:00
# `convert-llama2c-to-ggml` example binary.
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
                         ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-18 12:44:58 +02:00
# `llama-bench` example binary.
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h \
             ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-29 10:42:41 +02:00
# `baby-llama` example binary.
baby-llama: examples/baby-llama/baby-llama.cpp \
            ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-30 11:52:46 +02:00
# `beam-search` example binary.
beam-search: examples/beam-search/beam-search.cpp build-info.h \
             ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-09-04 14:50:04 +02:00
# `speculative` example binary; additionally links grammar-parser.o.
speculative: examples/speculative/speculative.cpp build-info.h \
             ggml.o llama.o common.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-29 10:42:41 +02:00
# Metal example binary.
#
# `metal` is appended to BUILD_TARGETS when LLAMA_METAL is set or when the
# `clean` goal was requested. The clean recipe expands BUILD_TARGETS only when
# it runs, so the binary is removed in both cases.
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif

ifdef LLAMA_METAL
# A rule's prerequisite list is expanded as soon as the rule is read, and
# `default` / `all` are defined at the top of the file, before the append
# above. Without this line `metal` would never be built by a plain `make` or
# `make all`, so attach it to both targets explicitly.
default all: metal

metal: examples/metal/metal.cpp ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif
2023-05-01 18:23:47 +02:00
# Generated header. It is refreshed when .git/index (if present) or the
# generator script changes. The script output goes to a temporary file first
# and replaces build-info.h only when the content differs, so an unchanged
# header keeps its timestamp.
build-info.h: $(wildcard .git/index) scripts/build-info.sh
	@sh scripts/build-info.sh > $@.tmp
	@if cmp -s $@.tmp $@; then \
		rm $@.tmp; \
	else \
		mv $@.tmp $@; \
	fi
2023-04-13 16:03:57 +02:00
2023-03-10 19:40:58 +01:00
#
# Tests
#
2023-07-21 12:09:16 +02:00
# Aggregate target that builds every test binary. It shares its name with the
# tests/ directory, so declare it phony: make then never compares it against
# the directory's timestamp.
.PHONY: tests
tests: $(TEST_TARGETS)
2023-06-07 09:59:52 +02:00
# Matrix-multiplication benchmark: linked, then run immediately as part of the
# same recipe.
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h \
                   ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
	./$@
2023-04-13 16:03:57 +02:00
2023-06-07 09:59:52 +02:00
# `vdot` proof-of-concept binary from pocs/.
vdot: pocs/vdot/vdot.cpp \
      ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
2023-08-28 17:38:35 +02:00
# Grammar test binary. Unlike the other tests, llama.o is not among its
# prerequisites.
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h \
                          ggml.o common.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-17 09:41:01 +02:00
2023-08-28 17:38:35 +02:00
# Grammar-parser test binary.
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h \
                           ggml.o llama.o common.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-13 16:00:48 +02:00
2023-08-02 10:06:19 +02:00
# double/float test binary.
tests/test-double-float: tests/test-double-float.cpp build-info.h \
                         ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-07-21 12:09:16 +02:00
2023-08-02 10:06:19 +02:00
# grad0 test binary.
tests/test-grad0: tests/test-grad0.cpp build-info.h \
                  ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-07-21 12:09:16 +02:00
2023-08-02 10:06:19 +02:00
# opt test binary.
tests/test-opt: tests/test-opt.cpp build-info.h \
                ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-07-21 12:09:16 +02:00
# quantize-fns test binary.
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h \
                         ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-07-21 12:09:16 +02:00
# quantize-perf test binary.
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h \
                          ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-07-21 12:09:16 +02:00
# sampling test binary.
tests/test-sampling: tests/test-sampling.cpp build-info.h \
                     ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-07-21 12:09:16 +02:00
2023-08-28 17:38:35 +02:00
# tokenizer-0 (falcon) test binary.
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h \
                               ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-28 17:38:35 +02:00
# tokenizer-0 (llama) test binary.
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h \
                              ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-28 17:38:35 +02:00
# tokenizer-1 test binary.
tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h \
                        ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(filter-out %.h,$^) $(LDFLAGS)
2023-08-30 08:20:26 +02:00
# Compile-only check: builds tests/test-c.c with the C compiler. llama.h is a
# prerequisite but is filtered out of the compile line.
tests/test-c.o: tests/test-c.c \
                llama.h
	$(CC) $(CFLAGS) -o $@ -c $(filter-out %.h,$^)