mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 11:23:56 +01:00
799a1cb13b
* convert : support Mixtral as LLAMA arch
* convert : fix n_ff typo
* llama : model loading
* ggml : sync latest ggml_mul_mat_id
* llama : update graph to support MoE
* llama : fix cur -> cur_expert
* llama : first working version
* llama : fix expert weighting in the FFN
* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)
* ggml : add n_as argument to ggml_mul_mat_id
* ggml : fix ggml_get_rows to take into account ne02 / ne11
* metal : add more general support for ggml_get_rows + tests
* llama : add basic support for offloading moe with CUDA
* metal : add/mul/div use general kernel when src1 not cont
* metal : reduce the kernel launches for ggml_mul_mat_id
* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D
* ggml : update get_rows f16 and q
* cuda : support non-contiguous src1 in get_rows
* llama : offload missing ffn_moe_silu
* metal : fix ggml_get_rows to work with non-cont src1
* metal : add indirect mat-vec kernels for all quantization types
* llama : do not quantize expert gating tensors
* llama : add n_expert and n_expert_used to hparams + change quants
* test-backend-ops : add moe test
* cuda : fix get_rows when ncols is odd
* convert : determine n_ctx correctly
* metal : fix ggml_mul_mat_id for F32
* test-backend-ops : make experts more evenly probable (test_moe)
* test-backend-ops : cleanup, add moe test for batches
* test-backend-ops : add cpy from f32 -> all types test
* test-backend-ops : fix dequantize block offset
* llama : fix hard-coded number of experts
* test-backend-ops : simplify and disable slow tests to avoid CI timeout
* test-backend-ops : disable MOE test with thread sanitizer
* cuda : fix mul_mat_id with multi gpu
* convert : use 1e6 rope_freq_base for mixtral
* convert : fix style
* convert : support safetensors format
* gguf-py : bump version
* metal : add cpy f16 -> f32 kernel
* metal : fix binary ops for ne10 % 4 != 0
* test-backend-ops : add one more sum_rows test
* ggml : do not use BLAS with ggml_mul_mat_id
* convert-hf : support for mixtral-instruct (#4428)
* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct
* convert : use sentencepiece tokenizer for Mixtral-instruct
* convert : make flake8 happy
* metal : fix soft_max kernels
ref: 1914017863
* metal : limit kernels to not use more than the allowed threads
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
36 lines
921 B
TOML
36 lines
921 B
TOML
[tool.poetry]
|
|
name = "gguf"
|
|
version = "0.7.0"
|
|
description = "Read and write ML models in GGUF for GGML"
|
|
authors = ["GGML <ggml@ggml.ai>"]
|
|
packages = [
|
|
{include = "gguf"},
|
|
{include = "gguf/py.typed"},
|
|
{include = "scripts"},
|
|
]
|
|
readme = "README.md"
|
|
homepage = "https://ggml.ai"
|
|
repository = "https://github.com/ggerganov/llama.cpp"
|
|
keywords = ["ggml", "gguf", "llama.cpp"]
|
|
classifiers = [
|
|
"Programming Language :: Python :: 3",
|
|
"License :: OSI Approved :: MIT License",
|
|
"Operating System :: OS Independent",
|
|
]
|
|
|
|
[tool.poetry.dependencies]
|
|
python = ">=3.8"
|
|
numpy = ">=1.17"
|
|
|
|
[tool.poetry.dev-dependencies]
|
|
pytest = "^5.2"
|
|
|
|
[build-system]
|
|
requires = ["poetry-core>=1.0.0"]
|
|
build-backend = "poetry.core.masonry.api"
|
|
|
|
[tool.poetry.scripts]
|
|
gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
|
|
gguf-dump = "scripts:gguf_dump_entrypoint"
|
|
gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
|