diff --git a/.devops/tools.sh b/.devops/tools.sh index b9a590d82..1ed20d56b 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -10,11 +10,11 @@ shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then python3 ./convert-hf-to-gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./quantize "$@" + ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then ./llama "$@" elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then - ./finetune "$@" + ./llama-finetune "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -22,7 +22,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." - ./quantize "$i" "${i/f16/q4_0}" q4_0 + ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then diff --git a/.dockerignore b/.dockerignore index f29a6dc31..42d0ad165 100644 --- a/.dockerignore +++ b/.dockerignore @@ -13,7 +13,7 @@ build*/ models/* /llama -/quantize +/llama-quantize arm_neon.h compile_commands.json diff --git a/.gitignore b/.gitignore index 8df35fe27..ae40f0653 100644 --- a/.gitignore +++ b/.gitignore @@ -46,48 +46,45 @@ models/* models-mnt /Pipfile -/baby-llama -/beam-search -/benchmark-matmult -/convert-llama2c-to-ggml -/embd-input-test -/embedding -/eval-callback -/gguf -/gguf-llama-simple -/gguf-split -/gritlm -/imatrix -/infill +/llama-baby +/llama-beam-search +/llama-benchmark-matmult +/llama-convert-llama2c-to-ggml +/llama-embedding +/llama-eval-callback +/llama-gguf +/llama-gguf-llama-simple +/llama-gguf-split +/llama-gritlm +/llama-imatrix +/llama-infill /libllama.so /llama /llama-bench -/llava-cli +/llava /llama-server -/lookahead -/lookup -/lookup-create -/lookup-merge -/lookup-stats -/metal -/passkey -/perplexity -/q8dot -/quantize -/quantize-stats -/result -/save-load-state -/simple -/batched -/batched-bench -/export-lora -/finetune -/retrieval -/speculative -/parallel -/train-text-from-scratch -/tokenize -/vdot +/llama-lookahead +/llama-lookup +/llama-lookup-create +/llama-lookup-merge +/llama-lookup-stats +/llama-passkey +/llama-perplexity +/llama-q8dot +/llama-quantize +/llama-quantize-stats +/llama-save-load-state +/llama-simple +/llama-batched +/llama-batched-bench +/llama-export-lora +/llama-finetune +/llama-retrieval +/llama-speculative +/llama-parallel +/llama-train-text-from-scratch +/llama-tokenize +/llama-vdot /common/build-info.cpp arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index 25d946ec9..74d681149 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,43 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ - llama quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state llama-server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o + llama \ + llama-baby \ + llama-batched \ + llama-batched-bench \ + llama-bench \ + llama-benchmark-matmult \ + llama-convert-llama2c-to-ggml \ + llama-embedding \ + llama-eval-callback \ + llama-export-lora \ + llama-finetune \ + llama-gguf \ + llama-gguf-split \ + llama-gritlm \ + llama-imatrix \ + llama-infill \ + llama-libllava.a \ + llama-llava \ + llama-lookahead \ + llama-lookup \ + llama-lookup-create \ + llama-lookup-merge \ + llama-lookup-stats \ + llama-parallel \ + llama-passkey \ + llama-perplexity \ + llama-q8dot \ + llama-quantize \ + llama-quantize-stats \ + llama-retrieval \ + llama-save-load-state \ + llama-server \ + llama-simple \ + llama-speculative \ + llama-tokenize \ + llama-train-text-from-scratch \ + llama-vdot \ + tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ @@ -777,7 +812,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) clean: - rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) rm -vrf ggml-cuda/*.o rm -vrf ggml-cuda/template-instances/*.o find examples pocs -type f -name "*.o" -delete @@ -800,51 +835,51 @@ llama: examples/main/main.cpp ggml.o llama.o $( @echo '==== Run ./llama -h for help. ====' @echo -infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) +llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -861,23 +896,23 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) +llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) +llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -888,41 +923,41 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual -llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-llava: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) -baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-baby: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) +llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) +llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) @@ -932,11 +967,11 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS) -passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -962,20 +997,20 @@ build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) +llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -run-benchmark-matmult: benchmark-matmult +run-benchmark-matmult: llama-benchmark-matmult ./$@ .PHONY: run-benchmark-matmult swift -vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) +llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) +llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index f74d54147..d4b895df3 100644 --- a/README.md +++ b/README.md @@ -622,10 +622,10 @@ python3 convert-hf-to-gguf.py models/mymodel/ python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe # quantize the model to 4-bits (using Q4_K_M method) -./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M +./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M # update the gguf filetype to current version if older version is now unsupported -./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY +./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY ``` ### Run the quantized model @@ -707,7 +707,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread #### How to run 1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip -2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw` +2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw` 3. Output: ``` perplexity : calculating perplexity over 655 chunks @@ -740,7 +740,7 @@ Note the use of `--color` to distinguish between user input and generated text. ### Persistent Interaction -The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. +The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. ```bash # Start a new chat diff --git a/baby-llama b/baby-llama new file mode 100755 index 000000000..5e6117262 Binary files /dev/null and b/baby-llama differ diff --git a/ci/run.sh b/ci/run.sh index d74eecc2a..f3d1ea70d 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -303,16 +303,16 @@ function gg_run_open_llama_7b_v2 { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k (time ./bin/llama --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log @@ -326,24 +326,24 @@ function gg_run_open_llama_7b_v2 { (time ./bin/llama --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -437,16 +437,16 @@ function gg_run_pythia_1_4b { wiki_test_60="${path_wiki}/wiki.test-60.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k (time ./bin/llama --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log @@ -460,22 +460,22 @@ function gg_run_pythia_1_4b { (time ./bin/llama --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -569,16 +569,16 @@ function gg_run_pythia_2_8b { wiki_test="${path_wiki}/wiki.test.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k + ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k + ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k + ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k + ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k (time ./bin/llama --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log @@ -592,24 +592,24 @@ function gg_run_pythia_2_8b { (time ./bin/llama --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -693,10 +693,10 @@ function gg_run_embd_bge_small { model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log set +e } diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index 7b70227a5..c7a3062d5 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET baby-llama) +set(TARGET llama-baby) add_executable(${TARGET} baby-llama.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 40a032c51..959acaeee 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET batched-bench) +set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index fa4baf640..4a07fe6bb 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) ```bash -./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] +./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 +./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps # custom set of batches -./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile index 2afb24fb8..1f9156e58 100755 --- a/examples/batched.swift/Makefile +++ b/examples/batched.swift/Makefile @@ -1,6 +1,6 @@ .PHONY: build build: - xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build - rm -f ./batched_swift - ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift + xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build + rm -f ./llama-batched-swift + ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift index 826491def..7e8afd084 100644 --- a/examples/batched.swift/Package.swift +++ b/examples/batched.swift/Package.swift @@ -4,7 +4,7 @@ import PackageDescription let package = Package( - name: "batched_swift", + name: "llama-batched-swift", platforms: [.macOS(.v12)], dependencies: [ .package(name: "llama", path: "../../"), @@ -13,7 +13,7 @@ let package = Package( // Targets are the basic building blocks of a package, defining a module or a test suite. // Targets can depend on other targets in this package and products from dependencies. .executableTarget( - name: "batched_swift", + name: "llama-batched-swift", dependencies: ["llama"], path: "Sources", linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")] diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md index 4c2721fe8..7f2e2fcdc 100644 --- a/examples/batched.swift/README.md +++ b/examples/batched.swift/README.md @@ -1,4 +1,4 @@ This is a swift clone of `examples/batched`. $ `make` -$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]` +$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]` diff --git a/examples/batched.swift/llama-batched-swift b/examples/batched.swift/llama-batched-swift new file mode 120000 index 000000000..319ca9771 --- /dev/null +++ b/examples/batched.swift/llama-batched-swift @@ -0,0 +1 @@ +./build/Build/Products/Debug/llama-batched-swift \ No newline at end of file diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 6aa178d4d..77e33343b 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET batched) +set(TARGET llama-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/batched/README.md b/examples/batched/README.md index ed204c308..6013aab01 100644 --- a/examples/batched/README.md +++ b/examples/batched/README.md @@ -3,7 +3,7 @@ The example demonstrates batched generation from a given prompt ```bash -./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 +./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 ... diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 2bb47bab5..34a58cc02 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET benchmark) +set(TARGET llama-bench-matmult) add_executable(${TARGET} benchmark-matmult.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt index e262d44f9..a6790e617 100644 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET convert-llama2c-to-ggml) +set(TARGET llama-convert-llama2c-to-ggml) add_executable(${TARGET} convert-llama2c-to-ggml.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 57c259249..1dc67a23c 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu After successful compilation, following usage options are available: ``` -usage: ./convert-llama2c-to-ggml [options] +usage: ./llama-convert-llama2c-to-ggml [options] options: -h, --help show this help message and exit @@ -19,7 +19,7 @@ options: An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: -`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` +`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 8ffc33868..8256e789a 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET embedding) +set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 6929454c5..2298ec3e7 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null +./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null +llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index c56ba780b..87ba1c515 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET eval-callback) +set(TARGET llama-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index cbbdaec67..1cef6e716 100644 --- a/examples/export-lora/CMakeLists.txt +++ b/examples/export-lora/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET export-lora) +set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 0cf3e8e45..c3a35feac 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -17,7 +17,7 @@ options: For example: ```bash -./bin/export-lora \ +./bin/llama-export-lora \ -m open-llama-3b-v2-q8_0.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt index 2b52d21cf..64afe6ddc 100644 --- a/examples/finetune/CMakeLists.txt +++ b/examples/finetune/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET finetune) +set(TARGET llama-finetune) add_executable(${TARGET} finetune.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index b07dbf67c..814430cfb 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -7,7 +7,7 @@ Basic usage instructions: wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt # finetune LORA adapter -./bin/finetune \ +./bin/llama-finetune \ --model-base open-llama-3b-v2-q8_0.gguf \ --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh index 079bfa113..d7f2165e5 100644 --- a/examples/finetune/finetune.sh +++ b/examples/finetune/finetune.sh @@ -2,7 +2,7 @@ cd `dirname $0` cd ../.. -EXE="./finetune" +EXE="./llama-finetune" if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index 166e3ad2a..e1114bbb7 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET gbnf-validator) +set(TARGET llama-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index 828e62435..f63887da7 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET gguf-split) +set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index 6481f087b..a9569b411 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET gguf) +set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index ac4a5ae79..86dfddca3 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET gritlm) +set(TARGET llama-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md index a3a3c1389..786ba5736 100644 --- a/examples/gritlm/README.md +++ b/examples/gritlm/README.md @@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou Run the example using the downloaded model: ```console -$ ./gritlm -m models/gritlm-7b_q4_1.gguf +$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index d688a1620..d4c8265bd 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET imatrix) +set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 866ca9f56..38b36ee5a 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/ ## Usage ``` -./imatrix \ +./llama-imatrix \ -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] @@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument LLAMA_CUDA=1 make -j # generate importance matrix (imatrix.dat) -./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 +./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 # use the imatrix to perform a Q4_K_M quantization -./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m ``` diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index e4e8028da..9b1aa3b63 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET infill) +set(TARGET llama-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/infill/README.md b/examples/infill/README.md index 6b076c839..74f42d2fc 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu ``` ```bash -./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " +./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " ``` diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 2985caff8..12ffc6b7b 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -32,6 +32,7 @@ endif() set(TARGET llava-cli) add_executable(llava-cli llava-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava) install(TARGETS llava-cli RUNTIME) target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(llava PRIVATE cxx_std_11) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 74f021dec..a35d6fd2b 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. ## Usage -Build with cmake or run `make llava-cli` to build it. +Build with cmake or run `make llama-llava` to build it. -After building, run: `./llava-cli` to see the usage. For example: +After building, run: `./llama-llava` to see the usage. For example: ```sh -./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ +./llama-llava -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ --image path/to/an/image.jpg \ -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" @@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` ```sh -./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s +./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ``` Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. @@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path` ### case 1 **input** ```sh -/data/local/tmp/llava-cli \ +/data/local/tmp/llama-llava \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms ### case 2 **input** ```sh -/data/local/tmp/llava-cli \ +/data/local/tmp/llama-llava \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms #### llava-cli release-b2005 **input** ```sh -/data/local/tmp/llava-cli \ +/data/local/tmp/llama-llava \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 ### case 1 **input** ```sh -./llava-cli \ +./llama-llava \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ --image /data/local/tmp/demo.jpeg \ @@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens ### case 2 **input** ```sh -./llava-cli \ +./llama-llava \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" \ diff --git a/examples/llava/README.md b/examples/llava/README.md index 8d1ae5270..f808297f1 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h After API is confirmed, more models will be supported / uploaded. ## Usage -Build with cmake or run `make llava-cli` to build it. +Build with cmake or run `make llama-llava` to build it. -After building, run: `./llava-cli` to see the usage. For example: +After building, run: `./llama-llava` to see the usage. For example: ```sh -./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg +./llama-llava -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg ``` **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. @@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` -7) And finally we can run the llava-cli using the 1.6 model version: +7) And finally we can run the llava cli using the 1.6 model version: ```console -./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 +./llama-llava -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ``` **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh index f73623ae3..aca375d07 100755 --- a/examples/llava/android/adb_run.sh +++ b/examples/llava/android/adb_run.sh @@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant. # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" program_dir="build_64/bin" -binName="llava-cli" +binName="llama-llava" n_threads=4 diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 9e7a119ba..a48a6283a 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -8,5 +8,5 @@ See the following PRs for more info: ### Usage ```bash -make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 +make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 ``` diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md index 2b2595c46..bc5f22e2f 100644 --- a/examples/retrieval/README.md +++ b/examples/retrieval/README.md @@ -15,7 +15,7 @@ https://github.com/ggerganov/llama.cpp/pull/6193 `retrieval` example can be tested as follows: ```bash -make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . +make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . ``` This chunks and embeds all given files and starts a loop requesting query inputs: diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index 31054b4b1..d40f53d2e 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -7,7 +7,7 @@ Basic usage instructions: wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt # train -./bin/train-text-from-scratch \ +./bin/llama-train-text-from-scratch \ --vocab-model ../models/ggml-vocab-llama.gguf \ --ctx 64 --embd 256 --head 8 --layer 16 \ --checkpoint-in chk-shakespeare-256x16-LATEST.gguf \ diff --git a/flake.nix b/flake.nix index 0a52ea52e..c69637d11 100644 --- a/flake.nix +++ b/flake.nix @@ -63,7 +63,7 @@ # nix-repl> :lf github:ggerganov/llama.cpp # Added 13 variables. # nix-repl> outputs.apps.x86_64-linux.quantize - # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/quantize"; type = "app"; } + # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/llama-quantize"; type = "app"; } # ``` outputs = { self, flake-parts, ... }@inputs: diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt index fb89a1cd4..d5405ad29 100644 --- a/pocs/vdot/CMakeLists.txt +++ b/pocs/vdot/CMakeLists.txt @@ -1,9 +1,9 @@ -set(TARGET vdot) +set(TARGET llama-vdot) add_executable(${TARGET} vdot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -set(TARGET q8dot) +set(TARGET llama-q8dot) add_executable(${TARGET} q8dot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/scripts/get-hellaswag.sh b/scripts/get-hellaswag.sh index 121979fe2..4e1b1cc15 100755 --- a/scripts/get-hellaswag.sh +++ b/scripts/get-hellaswag.sh @@ -4,7 +4,7 @@ wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]" +echo " ./llama-perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]" echo "" exit 0 diff --git a/scripts/get-wikitext-103.sh b/scripts/get-wikitext-103.sh index 880dd5cbe..9c65fafbc 100755 --- a/scripts/get-wikitext-103.sh +++ b/scripts/get-wikitext-103.sh @@ -4,7 +4,7 @@ wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1. echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]" +echo " ./llama-perplexity -m model.gguf -f wiki.test.raw [other params]" echo "" exit 0 diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh index b01476a46..5f3845ef5 100755 --- a/scripts/get-wikitext-2.sh +++ b/scripts/get-wikitext-2.sh @@ -5,7 +5,7 @@ unzip wikitext-2-raw-v1.zip echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" +echo " ./llama-perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" echo "" exit 0 diff --git a/scripts/get-winogrande.sh b/scripts/get-winogrande.sh index 5f234468e..f1fc0e2d4 100755 --- a/scripts/get-winogrande.sh +++ b/scripts/get-winogrande.sh @@ -4,7 +4,7 @@ wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]" +echo " ./llama-perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]" echo "" exit 0 diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 5dabbf60e..6ba499a2a 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -77,9 +77,9 @@ if [ "$1" -eq "1" ]; then python3 examples/convert-legacy-llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "2" ]; then @@ -92,9 +92,9 @@ if [ "$1" -eq "2" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "3" ]; then @@ -107,9 +107,9 @@ if [ "$1" -eq "3" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "4" ]; then @@ -122,9 +122,9 @@ if [ "$1" -eq "4" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "5" ]; then @@ -137,9 +137,9 @@ if [ "$1" -eq "5" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "6" ]; then @@ -152,9 +152,9 @@ if [ "$1" -eq "6" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "7" ]; then @@ -167,9 +167,9 @@ if [ "$1" -eq "7" ]; then python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 - ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 - ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k - ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0 + ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 + ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k + ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0 fi if [ "$1" -eq "1" ]; then @@ -181,22 +181,22 @@ if [ "$1" -eq "1" ]; then ../scripts/get-wikitext-2.sh unzip wikitext-2-raw-v1.zip - make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32 + make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32 # batched cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + LLAMA_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 # batched-bench cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 # parallel cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + LLAMA_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb fi @@ -204,10 +204,10 @@ fi #if [ "$1" -eq "7" ]; then # cd /workspace/llama.cpp # -# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +# LLAMA_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 #fi # more benches -#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/qnt-all.sh b/scripts/qnt-all.sh index b4c2a159e..bc43738a2 100755 --- a/scripts/qnt-all.sh +++ b/scripts/qnt-all.sh @@ -26,5 +26,5 @@ set -e mkdir -p ${out} for q in ${qnt[@]}; do - time ./bin/quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt + time ./bin/llama-quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt done diff --git a/scripts/run-all-ppl.sh b/scripts/run-all-ppl.sh index e04d61d7f..e15f74f1b 100755 --- a/scripts/run-all-ppl.sh +++ b/scripts/run-all-ppl.sh @@ -26,5 +26,5 @@ out="../tmp/results-${model}" mkdir -p ${out} for q in ${qnt[@]}; do - time ./bin/perplexity -m ../models/${model}/ggml-model-f16.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt + time ./bin/llama-perplexity -m ../models/${model}/ggml-model-f16.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt done