mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 03:12:23 +01:00
Support all LLaMA models + change Q4_0 quantization storage
This commit is contained in:
parent
5f2f970d51
commit
007a8f6f45
42
README.md
42
README.md
@ -17,11 +17,10 @@ The main goal is to run the model using 4-bit quantization on a MacBook.
|
|||||||
|
|
||||||
This was hacked in an evening - I have no idea if it works correctly.
|
This was hacked in an evening - I have no idea if it works correctly.
|
||||||
|
|
||||||
So far, I've tested just the 7B model.
|
Here is a typical run using LLaMA-7B:
|
||||||
Here is a typical run:
|
|
||||||
|
|
||||||
```java
|
```java
|
||||||
make -j && ./main -m ../LLaMA-4bit/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
|
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
|
||||||
I llama.cpp build info:
|
I llama.cpp build info:
|
||||||
I UNAME_S: Darwin
|
I UNAME_S: Darwin
|
||||||
I UNAME_P: arm
|
I UNAME_P: arm
|
||||||
@ -34,7 +33,7 @@ I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202)
|
|||||||
|
|
||||||
make: Nothing to be done for `default'.
|
make: Nothing to be done for `default'.
|
||||||
main: seed = 1678486056
|
main: seed = 1678486056
|
||||||
llama_model_load: loading model from '../LLaMA-4bit/7B/ggml-model-q4_0.bin' - please wait ...
|
llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ...
|
||||||
llama_model_load: n_vocab = 32000
|
llama_model_load: n_vocab = 32000
|
||||||
llama_model_load: n_ctx = 512
|
llama_model_load: n_ctx = 512
|
||||||
llama_model_load: n_embd = 4096
|
llama_model_load: n_embd = 4096
|
||||||
@ -110,6 +109,8 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
Here are the step for the LLaMA-7B model:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# build this repo
|
# build this repo
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
git clone https://github.com/ggerganov/llama.cpp
|
||||||
@ -133,9 +134,40 @@ python3 convert-pth-to-ggml.py models/7B/ 1
|
|||||||
./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
|
./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
|
||||||
```
|
```
|
||||||
|
|
||||||
|
For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format
|
||||||
|
will create 2 ggml files, instead of one:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ggml-model-f16.bin
|
||||||
|
ggml-model-f16.bin.1
|
||||||
|
```
|
||||||
|
|
||||||
|
You need to quantize each of them separately like this:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2
|
||||||
|
./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2
|
||||||
|
```
|
||||||
|
|
||||||
|
Everything else is the same. Simply run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128
|
||||||
|
```
|
||||||
|
|
||||||
|
The number of files generated for each model is as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
7B -> 1 file
|
||||||
|
13B -> 2 files
|
||||||
|
33B -> 4 files
|
||||||
|
65B -> 8 files
|
||||||
|
```
|
||||||
|
|
||||||
|
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
- Currently, only LLaMA-7B is supported since I haven't figured out how to merge the tensors of the bigger models. However, in theory, you should be able to run 65B on a 64GB MacBook
|
|
||||||
- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
|
- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
|
||||||
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
|
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
|
||||||
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
|
- https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
|
||||||
|
@ -33,12 +33,23 @@ if len(sys.argv) < 3:
|
|||||||
|
|
||||||
# output in the same directory as the model
|
# output in the same directory as the model
|
||||||
dir_model = sys.argv[1]
|
dir_model = sys.argv[1]
|
||||||
fname_out = sys.argv[1] + "/ggml-model.bin"
|
|
||||||
|
|
||||||
fname_hparams = sys.argv[1] + "/params.json"
|
fname_hparams = sys.argv[1] + "/params.json"
|
||||||
fname_model = sys.argv[1] + "/consolidated.00.pth"
|
|
||||||
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
|
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
|
||||||
|
|
||||||
|
def get_n_parts(dim):
|
||||||
|
if dim == 4096:
|
||||||
|
return 1
|
||||||
|
elif dim == 5120:
|
||||||
|
return 2
|
||||||
|
elif dim == 6656:
|
||||||
|
return 4
|
||||||
|
elif dim == 8192:
|
||||||
|
return 8
|
||||||
|
else:
|
||||||
|
print("Invalid dim: " + str(dim))
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
# possible data types
|
# possible data types
|
||||||
# ftype == 0 -> float32
|
# ftype == 0 -> float32
|
||||||
# ftype == 1 -> float16
|
# ftype == 1 -> float16
|
||||||
@ -61,23 +72,35 @@ tokenizer = SentencePieceProcessor(fname_tokenizer)
|
|||||||
|
|
||||||
hparams.update({"vocab_size": tokenizer.vocab_size()})
|
hparams.update({"vocab_size": tokenizer.vocab_size()})
|
||||||
|
|
||||||
|
n_parts = get_n_parts(hparams["dim"])
|
||||||
|
|
||||||
print(hparams)
|
print(hparams)
|
||||||
|
print('n_parts = ', n_parts)
|
||||||
|
|
||||||
model = torch.load(fname_model, map_location="cpu")
|
for p in range(n_parts):
|
||||||
|
print('Processing part ', p)
|
||||||
|
|
||||||
fout = open(fname_out, "wb")
|
#fname_model = sys.argv[1] + "/consolidated.00.pth"
|
||||||
|
fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
|
||||||
|
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
|
||||||
|
if (p > 0):
|
||||||
|
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
|
||||||
|
|
||||||
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
|
model = torch.load(fname_model, map_location="cpu")
|
||||||
fout.write(struct.pack("i", hparams["vocab_size"]))
|
|
||||||
fout.write(struct.pack("i", hparams["dim"]))
|
|
||||||
fout.write(struct.pack("i", hparams["multiple_of"]))
|
|
||||||
fout.write(struct.pack("i", hparams["n_heads"]))
|
|
||||||
fout.write(struct.pack("i", hparams["n_layers"]))
|
|
||||||
fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
|
|
||||||
fout.write(struct.pack("i", ftype))
|
|
||||||
|
|
||||||
# Is this correct??
|
fout = open(fname_out, "wb")
|
||||||
for i in range(32000):
|
|
||||||
|
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
|
||||||
|
fout.write(struct.pack("i", hparams["vocab_size"]))
|
||||||
|
fout.write(struct.pack("i", hparams["dim"]))
|
||||||
|
fout.write(struct.pack("i", hparams["multiple_of"]))
|
||||||
|
fout.write(struct.pack("i", hparams["n_heads"]))
|
||||||
|
fout.write(struct.pack("i", hparams["n_layers"]))
|
||||||
|
fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
|
||||||
|
fout.write(struct.pack("i", ftype))
|
||||||
|
|
||||||
|
# Is this correct??
|
||||||
|
for i in range(32000):
|
||||||
# TODO: this is probably wrong - not sure how this tokenizer works
|
# TODO: this is probably wrong - not sure how this tokenizer works
|
||||||
text = tokenizer.decode([29889, i]).encode('utf-8')
|
text = tokenizer.decode([29889, i]).encode('utf-8')
|
||||||
# remove the first byte (it's always '.')
|
# remove the first byte (it's always '.')
|
||||||
@ -85,7 +108,7 @@ for i in range(32000):
|
|||||||
fout.write(struct.pack("i", len(text)))
|
fout.write(struct.pack("i", len(text)))
|
||||||
fout.write(text)
|
fout.write(text)
|
||||||
|
|
||||||
for k, v in model.items():
|
for k, v in model.items():
|
||||||
name = k
|
name = k
|
||||||
shape = v.shape
|
shape = v.shape
|
||||||
|
|
||||||
@ -121,16 +144,19 @@ for k, v in model.items():
|
|||||||
ftype_cur = 0
|
ftype_cur = 0
|
||||||
|
|
||||||
# header
|
# header
|
||||||
str = name.encode('utf-8')
|
sname = name.encode('utf-8')
|
||||||
fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
|
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
|
||||||
for i in range(n_dims):
|
for i in range(n_dims):
|
||||||
fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
|
fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
|
||||||
fout.write(str);
|
fout.write(sname);
|
||||||
|
|
||||||
# data
|
# data
|
||||||
data.tofile(fout)
|
data.tofile(fout)
|
||||||
|
|
||||||
fout.close()
|
# I hope this deallocates the memory ..
|
||||||
|
model = None
|
||||||
|
|
||||||
print("Done. Output file: " + fname_out)
|
fout.close()
|
||||||
print("")
|
|
||||||
|
print("Done. Output file: " + fname_out + ", (part ", p, ")")
|
||||||
|
print("")
|
||||||
|
97
ggml.c
97
ggml.c
@ -366,9 +366,10 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
assert(k % QK == 0);
|
assert(k % QK == 0);
|
||||||
|
|
||||||
const int nb = k / QK;
|
const int nb = k / QK;
|
||||||
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
|
|
||||||
float * restrict pd = (float *) (y);
|
uint8_t * restrict pd = (uint8_t *) (y + 0*bs);
|
||||||
uint8_t * restrict pb = (uint8_t *) (pd + nb);
|
uint8_t * restrict pb = (uint8_t *) (y + 0*bs + sizeof(float));
|
||||||
|
|
||||||
uint8_t pp[QK/2];
|
uint8_t pp[QK/2];
|
||||||
|
|
||||||
@ -395,7 +396,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
const float d = amax / ((1 << 3) - 1);
|
const float d = amax / ((1 << 3) - 1);
|
||||||
const float id = d ? 1.0/d : 0.0;
|
const float id = d ? 1.0/d : 0.0;
|
||||||
|
|
||||||
pd[i] = d;
|
*(float *)pd = d;
|
||||||
|
pd += bs;
|
||||||
|
|
||||||
for (int l = 0; l < 8; l++) {
|
for (int l = 0; l < 8; l++) {
|
||||||
const float32x4_t v = vmulq_n_f32(srcv[l], id);
|
const float32x4_t v = vmulq_n_f32(srcv[l], id);
|
||||||
@ -406,7 +408,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
|
pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(pb + i*16, pp, sizeof(pp));
|
memcpy(pb, pp, sizeof(pp));
|
||||||
|
pb += bs;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
#error "not implemented for QK"
|
#error "not implemented for QK"
|
||||||
@ -434,7 +437,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
const float d = amax / ((1 << 3) - 1);
|
const float d = amax / ((1 << 3) - 1);
|
||||||
const float id = d ? 1.0/d : 0.0;
|
const float id = d ? 1.0/d : 0.0;
|
||||||
|
|
||||||
pd[i] = d;
|
*(float *)pd = d;
|
||||||
|
pd += bs;
|
||||||
|
|
||||||
for (int l = 0; l < 8; l++) {
|
for (int l = 0; l < 8; l++) {
|
||||||
const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
|
const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
|
||||||
@ -445,7 +449,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
|
pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(pb + i*16, pp, sizeof(pp));
|
memcpy(pb, pp, sizeof(pp));
|
||||||
|
pb += bs;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
#error "not implemented for QK"
|
#error "not implemented for QK"
|
||||||
@ -463,7 +468,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
const float d = amax / ((1 << 3) - 1);
|
const float d = amax / ((1 << 3) - 1);
|
||||||
const float id = d ? 1.0f/d : 0.0f;
|
const float id = d ? 1.0f/d : 0.0f;
|
||||||
|
|
||||||
pd[i] = d;
|
*(float *)pd = d;
|
||||||
|
pd += bs;
|
||||||
|
|
||||||
for (int l = 0; l < QK; l += 2) {
|
for (int l = 0; l < QK; l += 2) {
|
||||||
const float v0 = x[i*QK + l + 0]*id;
|
const float v0 = x[i*QK + l + 0]*id;
|
||||||
@ -478,7 +484,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
pp[l/2] = vi0 | (vi1 << 4);
|
pp[l/2] = vi0 | (vi1 << 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(pb + i*QK/2, pp, sizeof(pp));
|
memcpy(pb, pp, sizeof(pp));
|
||||||
|
pb += bs;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -535,15 +542,16 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
|
|||||||
assert(k % QK == 0);
|
assert(k % QK == 0);
|
||||||
|
|
||||||
const int nb = k / QK;
|
const int nb = k / QK;
|
||||||
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
|
|
||||||
const float * restrict pd = (const float *) (x);
|
const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
|
||||||
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
|
const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
|
||||||
|
|
||||||
// scalar
|
// scalar
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
const float d = pd[i];
|
const float d = *(const float *) (pd + i*bs);
|
||||||
|
|
||||||
const uint8_t * restrict pp = pb + i*QK/2;
|
const uint8_t * restrict pp = pb + i*bs;
|
||||||
|
|
||||||
for (int l = 0; l < QK; l += 2) {
|
for (int l = 0; l < QK; l += 2) {
|
||||||
const uint8_t vi = pp[l/2];
|
const uint8_t vi = pp[l/2];
|
||||||
@ -554,6 +562,8 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
|
|||||||
const float v0 = (vi0 - 8)*d;
|
const float v0 = (vi0 - 8)*d;
|
||||||
const float v1 = (vi1 - 8)*d;
|
const float v1 = (vi1 - 8)*d;
|
||||||
|
|
||||||
|
//printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1);
|
||||||
|
|
||||||
y[i*QK + l + 0] = v0;
|
y[i*QK + l + 0] = v0;
|
||||||
y[i*QK + l + 1] = v1;
|
y[i*QK + l + 1] = v1;
|
||||||
|
|
||||||
@ -1179,11 +1189,13 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
|||||||
assert(n % QK == 0);
|
assert(n % QK == 0);
|
||||||
assert(nb % 2 == 0);
|
assert(nb % 2 == 0);
|
||||||
|
|
||||||
const float * restrict pd0 = (const float *) x;
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
const float * restrict pd1 = (const float *) y;
|
|
||||||
|
|
||||||
const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
|
const uint8_t * restrict pd0 = (const uint8_t *) (x + 0*bs);
|
||||||
const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
|
const uint8_t * restrict pd1 = (const uint8_t *) (y + 0*bs);
|
||||||
|
|
||||||
|
const uint8_t * restrict pb0 = (const uint8_t *) (x + 0*bs + sizeof(float));
|
||||||
|
const uint8_t * restrict pb1 = (const uint8_t *) (y + 0*bs + sizeof(float));
|
||||||
|
|
||||||
float sumf = 0.0;
|
float sumf = 0.0;
|
||||||
|
|
||||||
@ -1193,23 +1205,23 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
|||||||
float sum1 = 0.0f;
|
float sum1 = 0.0f;
|
||||||
|
|
||||||
for (int i = 0; i < nb; i += 2) {
|
for (int i = 0; i < nb; i += 2) {
|
||||||
const float d0_0 = pd0[i + 0];
|
const float d0_0 = *(const float *) (pd0 + i*bs);
|
||||||
const float d1_0 = pd1[i + 0];
|
const float d1_0 = *(const float *) (pd1 + i*bs);
|
||||||
const float d0_1 = pd0[i + 1];
|
const float d0_1 = *(const float *) (pd0 + (i + 1)*bs);
|
||||||
const float d1_1 = pd1[i + 1];
|
const float d1_1 = *(const float *) (pd1 + (i + 1)*bs);
|
||||||
|
|
||||||
//printf("d0_0: %f, d1_0: %f, d0_1: %f, d1_1: %f\n", d0_0, d1_0, d0_1, d1_1);
|
//printf("d0_0: %f, d1_0: %f, d0_1: %f, d1_1: %f\n", d0_0, d1_0, d0_1, d1_1);
|
||||||
|
|
||||||
const uint8_t * restrict p0 = pb0 + i*16;
|
const uint8_t * restrict p0 = pb0 + i*bs;
|
||||||
const uint8_t * restrict p1 = pb1 + i*16;
|
const uint8_t * restrict p1 = pb1 + i*bs;
|
||||||
|
|
||||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||||
const int8x16_t s8b = vdupq_n_s8(0x8);
|
const int8x16_t s8b = vdupq_n_s8(0x8);
|
||||||
|
|
||||||
const uint8x16_t v0_0 = vld1q_u8(p0);
|
const uint8x16_t v0_0 = vld1q_u8(p0);
|
||||||
const uint8x16_t v1_0 = vld1q_u8(p1);
|
const uint8x16_t v1_0 = vld1q_u8(p1);
|
||||||
const uint8x16_t v0_1 = vld1q_u8(p0 + 16);
|
const uint8x16_t v0_1 = vld1q_u8(p0 + bs);
|
||||||
const uint8x16_t v1_1 = vld1q_u8(p1 + 16);
|
const uint8x16_t v1_1 = vld1q_u8(p1 + bs);
|
||||||
|
|
||||||
// 4-bit -> 8-bit
|
// 4-bit -> 8-bit
|
||||||
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
|
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
|
||||||
@ -1280,21 +1292,21 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
|||||||
float sum1 = 0.0f;
|
float sum1 = 0.0f;
|
||||||
|
|
||||||
for (int i = 0; i < nb; i += 2) {
|
for (int i = 0; i < nb; i += 2) {
|
||||||
const float d0_0 = pd0[i + 0];
|
const float d0_0 = *(const float *) (pd0 + i*bs);
|
||||||
const float d0_1 = pd0[i + 1];
|
const float d1_0 = *(const float *) (pd1 + i*bs);
|
||||||
const float d1_0 = pd1[i + 0];
|
const float d0_1 = *(const float *) (pd0 + (i + 1)*bs);
|
||||||
const float d1_1 = pd1[i + 1];
|
const float d1_1 = *(const float *) (pd1 + (i + 1)*bs);
|
||||||
|
|
||||||
const uint8_t * restrict p0 = pb0 + i*16;
|
const uint8_t * restrict p0 = pb0 + i*bs;
|
||||||
const uint8_t * restrict p1 = pb1 + i*16;
|
const uint8_t * restrict p1 = pb1 + i*bs;
|
||||||
|
|
||||||
const v128_t m4b = wasm_u8x16_splat(0xf);
|
const v128_t m4b = wasm_u8x16_splat(0xf);
|
||||||
const v128_t s8b = wasm_i8x16_splat(0x8);
|
const v128_t s8b = wasm_i8x16_splat(0x8);
|
||||||
|
|
||||||
const v128_t v0_0 = wasm_v128_load(p0);
|
const v128_t v0_0 = wasm_v128_load(p0);
|
||||||
const v128_t v0_1 = wasm_v128_load(p0 + 16);
|
const v128_t v0_1 = wasm_v128_load(p0 + bs);
|
||||||
const v128_t v1_0 = wasm_v128_load(p1);
|
const v128_t v1_0 = wasm_v128_load(p1);
|
||||||
const v128_t v1_1 = wasm_v128_load(p1 + 16);
|
const v128_t v1_1 = wasm_v128_load(p1 + bs);
|
||||||
|
|
||||||
// 4-bit -> 8-bit
|
// 4-bit -> 8-bit
|
||||||
const v128_t v0_0l = wasm_v128_and(v0_0, m4b);
|
const v128_t v0_0l = wasm_v128_and(v0_0, m4b);
|
||||||
@ -1363,11 +1375,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
|||||||
#else
|
#else
|
||||||
// scalar
|
// scalar
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
const float d0 = pd0[i];
|
const float d0 = *(const float *) (pd0 + i*bs);
|
||||||
const float d1 = pd1[i];
|
const float d1 = *(const float *) (pd1 + i*bs);
|
||||||
|
|
||||||
const uint8_t * restrict p0 = pb0 + i*QK/2;
|
const uint8_t * restrict p0 = pb0 + i*bs;
|
||||||
const uint8_t * restrict p1 = pb1 + i*QK/2;
|
const uint8_t * restrict p1 = pb1 + i*bs;
|
||||||
|
|
||||||
for (int j = 0; j < QK/2; j++) {
|
for (int j = 0; j < QK/2; j++) {
|
||||||
const uint8_t v0 = p0[j];
|
const uint8_t v0 = p0[j];
|
||||||
@ -1552,16 +1564,17 @@ inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * res
|
|||||||
assert(n % QK == 0);
|
assert(n % QK == 0);
|
||||||
|
|
||||||
const int nb = n / QK;
|
const int nb = n / QK;
|
||||||
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
|
|
||||||
const float * restrict pd = (const float *) (x);
|
const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
|
||||||
const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
|
const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
|
||||||
|
|
||||||
#if __ARM_NEON
|
#if __ARM_NEON
|
||||||
#if QK == 32
|
#if QK == 32
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
const float d0 = pd[i]*v;
|
const float d0 = v*(*(const float *) (pd + i*bs));
|
||||||
|
|
||||||
const uint8_t * restrict pp = pb + i*16;
|
const uint8_t * restrict pp = pb + i*bs;
|
||||||
|
|
||||||
const uint8x8_t m4b = vdup_n_u8(0xf);
|
const uint8x8_t m4b = vdup_n_u8(0xf);
|
||||||
const int8x8_t s8b = vdup_n_s8(0x8);
|
const int8x8_t s8b = vdup_n_s8(0x8);
|
||||||
@ -1615,9 +1628,9 @@ inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * res
|
|||||||
#else
|
#else
|
||||||
// scalar
|
// scalar
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
const float d = pd[i];
|
const float d = *(const float *) (pd + i*bs);
|
||||||
|
|
||||||
const uint8_t * restrict pp = pb + i*QK/2;
|
const uint8_t * restrict pp = pb + i*bs;
|
||||||
|
|
||||||
for (int l = 0; l < QK; l += 2) {
|
for (int l = 0; l < QK; l += 2) {
|
||||||
const uint8_t vi = pp[l/2];
|
const uint8_t vi = pp[l/2];
|
||||||
|
129
main.cpp
129
main.cpp
@ -11,6 +11,14 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
// determine number of model parts based on the dimension
|
||||||
|
static const std::map<int, int> LLAMA_N_PARTS = {
|
||||||
|
{ 4096, 1 },
|
||||||
|
{ 5120, 2 },
|
||||||
|
{ 6656, 4 },
|
||||||
|
{ 8192, 8 },
|
||||||
|
};
|
||||||
|
|
||||||
// default hparams (LLaMA 7B)
|
// default hparams (LLaMA 7B)
|
||||||
struct llama_hparams {
|
struct llama_hparams {
|
||||||
int32_t n_vocab = 32000;
|
int32_t n_vocab = 32000;
|
||||||
@ -82,6 +90,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
}
|
}
|
||||||
|
|
||||||
int n_ff = 0;
|
int n_ff = 0;
|
||||||
|
int n_parts = 0;
|
||||||
|
|
||||||
// load hparams
|
// load hparams
|
||||||
{
|
{
|
||||||
@ -99,6 +108,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
hparams.n_ctx = n_ctx;
|
hparams.n_ctx = n_ctx;
|
||||||
|
|
||||||
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
||||||
|
n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
|
||||||
|
|
||||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||||
@ -109,6 +119,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||||
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
||||||
printf("%s: n_ff = %d\n", __func__, n_ff);
|
printf("%s: n_ff = %d\n", __func__, n_ff);
|
||||||
|
printf("%s: n_parts = %d\n", __func__, n_parts);
|
||||||
}
|
}
|
||||||
|
|
||||||
// load vocab
|
// load vocab
|
||||||
@ -282,6 +293,26 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const size_t file_offset = fin.tellg();
|
||||||
|
|
||||||
|
fin.close();
|
||||||
|
|
||||||
|
std::vector<uint8_t> tmp;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_parts; ++i) {
|
||||||
|
const int part_id = i;
|
||||||
|
//const int part_id = n_parts - i - 1;
|
||||||
|
|
||||||
|
std::string fname_part = fname;
|
||||||
|
if (i > 0) {
|
||||||
|
fname_part += "." + std::to_string(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
|
||||||
|
|
||||||
|
fin = std::ifstream(fname_part, std::ios::binary);
|
||||||
|
fin.seekg(file_offset);
|
||||||
|
|
||||||
// load weights
|
// load weights
|
||||||
{
|
{
|
||||||
int n_tensors = 0;
|
int n_tensors = 0;
|
||||||
@ -317,21 +348,77 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// split_type = 0: split by columns
|
||||||
|
// split_type = 1: split by rows
|
||||||
|
int split_type = 0;
|
||||||
|
|
||||||
|
// split_type = 0:
|
||||||
|
// regex:
|
||||||
|
// - tok_embeddings.*
|
||||||
|
// - layers.*.attention.wo.weight
|
||||||
|
// - layers.*.feed_forward.w2.weight
|
||||||
|
|
||||||
|
// split_type = 1:
|
||||||
|
// regex:
|
||||||
|
// - output.*
|
||||||
|
// - layers.*.attention.wq.weight
|
||||||
|
// - layers.*.attention.wk.weight
|
||||||
|
// - layers.*.attention.wv.weight
|
||||||
|
// - layers.*.feed_forward.w1.weight
|
||||||
|
// - layers.*.feed_forward.w3.weight
|
||||||
|
if (name.find("tok_embeddings") != std::string::npos) {
|
||||||
|
split_type = 0;
|
||||||
|
} else if (name.find("layers") != std::string::npos) {
|
||||||
|
if (name.find("attention.wo.weight") != std::string::npos) {
|
||||||
|
split_type = 0;
|
||||||
|
} else if (name.find("feed_forward.w2.weight") != std::string::npos) {
|
||||||
|
split_type = 0;
|
||||||
|
} else {
|
||||||
|
split_type = 1;
|
||||||
|
}
|
||||||
|
} else if (name.find("output") != std::string::npos) {
|
||||||
|
split_type = 1;
|
||||||
|
}
|
||||||
|
|
||||||
auto tensor = model.tensors[name.data()];
|
auto tensor = model.tensors[name.data()];
|
||||||
|
|
||||||
|
if (n_dims == 1) {
|
||||||
if (ggml_nelements(tensor) != nelements) {
|
if (ggml_nelements(tensor) != nelements) {
|
||||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (ggml_nelements(tensor)/n_parts != nelements) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_dims == 1) {
|
||||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (split_type == 0) {
|
||||||
|
if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||||
|
__func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||||
|
__func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (0) {
|
if (0) {
|
||||||
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
||||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t bpe = 0;
|
size_t bpe = 0;
|
||||||
@ -348,16 +435,53 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (n_dims == 1 || n_parts == 1) {
|
||||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (part_id == 0) {
|
||||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||||
|
} else {
|
||||||
|
fin.seekg(ggml_nbytes(tensor), std::ios::cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
total_size += ggml_nbytes(tensor);
|
||||||
|
} else {
|
||||||
|
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||||
|
__func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (split_type == 0) {
|
||||||
|
const int np0 = ne[0];
|
||||||
|
|
||||||
|
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
||||||
|
assert(row_size == tensor->nb[1]);
|
||||||
|
|
||||||
|
for (int i1 = 0; i1 < ne[1]; ++i1) {
|
||||||
|
const size_t offset_row = i1*row_size;
|
||||||
|
const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
||||||
|
fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const int np1 = ne[1];
|
||||||
|
|
||||||
|
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
|
||||||
|
|
||||||
|
for (int i1 = 0; i1 < ne[1]; ++i1) {
|
||||||
|
const size_t offset_row = (i1 + part_id*np1)*row_size;
|
||||||
|
fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
total_size += ggml_nbytes(tensor)/n_parts;
|
||||||
|
}
|
||||||
|
|
||||||
//printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
//printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||||
total_size += ggml_nbytes(tensor);
|
|
||||||
if (++n_tensors % 8 == 0) {
|
if (++n_tensors % 8 == 0) {
|
||||||
printf(".");
|
printf(".");
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
@ -370,6 +494,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
}
|
}
|
||||||
|
|
||||||
fin.close();
|
fin.close();
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
13
utils.cpp
13
utils.cpp
@ -448,7 +448,8 @@ gpt_vocab::id llama_sample_top_p(
|
|||||||
|
|
||||||
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
|
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
|
||||||
const int nb = k / qk;
|
const int nb = k / qk;
|
||||||
const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*qk/2);
|
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
|
||||||
|
const size_t row_size = nb*bs;
|
||||||
|
|
||||||
assert(k % qk == 0);
|
assert(k % qk == 0);
|
||||||
|
|
||||||
@ -457,8 +458,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
|
|||||||
char * pdst = (char *) dst;
|
char * pdst = (char *) dst;
|
||||||
|
|
||||||
for (int j = 0; j < n; j += k) {
|
for (int j = 0; j < n; j += k) {
|
||||||
float * pd = (float *) (pdst + (j/k)*row_size);
|
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
|
||||||
uint8_t * pb = (uint8_t *) (pd + nb);
|
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
float amax = 0.0f; // absolute max
|
float amax = 0.0f; // absolute max
|
||||||
@ -472,7 +473,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
|
|||||||
const float d = amax / ((1 << 3) - 1);
|
const float d = amax / ((1 << 3) - 1);
|
||||||
const float id = d ? 1.0f/d : 0.0f;
|
const float id = d ? 1.0f/d : 0.0f;
|
||||||
|
|
||||||
pd[i] = d;
|
*(float *) pd = d;
|
||||||
|
pd += bs;
|
||||||
|
|
||||||
for (int l = 0; l < qk; l += 2) {
|
for (int l = 0; l < qk; l += 2) {
|
||||||
const float v0 = (src[j + i*qk + l + 0])*id;
|
const float v0 = (src[j + i*qk + l + 0])*id;
|
||||||
@ -490,7 +492,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
|
|||||||
pp[l/2] = vi0 | (vi1 << 4);
|
pp[l/2] = vi0 | (vi1 << 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(pb + i*qk/2, pp, sizeof(pp));
|
memcpy(pb, pp, sizeof(pp));
|
||||||
|
pb += bs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user