mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-29 11:09:33 +01:00
dev note on tensor encoding LUT
parent
212923661c
commit
d677b5cb7d
113
dev-notes.md
113
dev-notes.md
@ -60,3 +60,116 @@ Aka it's for the writing/reading api.
|
|||||||
There is a C++ example program that writes a test gguf file and reads it back:
|
There is a C++ example program that writes a test gguf file and reads it back:
|
||||||
|
|
||||||
- [./examples/gguf/gguf.cpp](https://github.com/ggerganov/llama.cpp/blob/master/examples/gguf/gguf.cpp)
|
- [./examples/gguf/gguf.cpp](https://github.com/ggerganov/llama.cpp/blob/master/examples/gguf/gguf.cpp)
|
||||||
|
|
||||||
|
|
||||||
|
### If we don't store the size of tensor array elements etc. in gguf, where do we store these?
|
||||||
|
|
||||||
|
In ggml.c, refer to `static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT]`,
|
||||||
|
which is a lookup table containing enough information to deduce the size of a tensor layer
|
||||||
|
in bytes if given an offset and element dimension count.
|
||||||
|
|
||||||
|
One good example is shown below (but annotated for clarity):
|
||||||
|
|
||||||
|
```c
|
||||||
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
|
...
|
||||||
|
[GGML_TYPE_F16] = {
|
||||||
|
// General Specs About This Tensor Encoding Scheme
|
||||||
|
.type_name = "f16",
|
||||||
|
.blck_size = 1,
|
||||||
|
.type_size = sizeof(ggml_fp16_t),
|
||||||
|
.is_quantized = false,
|
||||||
|
|
||||||
|
// C function methods for interpreting the blocks
|
||||||
|
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
||||||
|
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
|
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
|
|
||||||
|
// C functions plus extra specs required for dot product handling
|
||||||
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||||
|
.vec_dot_type = GGML_TYPE_F16,
|
||||||
|
.nrows = 1,
|
||||||
|
},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
So basically these are used in various places to let developers
|
||||||
|
get a sense of the tensor encoding spec and sizing as you can see with the
|
||||||
|
getter methods below (Note: I didn't fully trace the other functions directly using
|
||||||
|
the values within ggml.c; the few in this graph are just for illustrative purposes):
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph LR;
|
||||||
|
type_traits{"type_traits[]\n Lookup Table"}
|
||||||
|
type_traits-->type_name
|
||||||
|
type_traits-->blck_size
|
||||||
|
type_traits-->type_size
|
||||||
|
type_traits-->is_quantized
|
||||||
|
%%type_traits-->to_float
|
||||||
|
%%type_traits-->from_float
|
||||||
|
%%type_traits-->from_float_reference
|
||||||
|
%%type_traits-->vec_dot
|
||||||
|
%%type_traits-->vec_dot_type
|
||||||
|
%%type_traits-->nrows
|
||||||
|
|
||||||
|
subgraph getter functions / methods
|
||||||
|
ggml_type_name(["ggml_type_name()"])
|
||||||
|
ggml_blck_size(["ggml_blck_size()"])
|
||||||
|
ggml_type_size(["ggml_type_size()"])
|
||||||
|
ggml_is_quantized(["ggml_is_quantized()"])
|
||||||
|
end
|
||||||
|
type_name --> ggml_type_name(["ggml_type_name()"])
|
||||||
|
blck_size --> ggml_blck_size(["ggml_blck_size()"])
|
||||||
|
type_size --> ggml_type_size(["ggml_type_size()"])
|
||||||
|
is_quantized --> ggml_is_quantized(["ggml_is_quantized()"])
|
||||||
|
|
||||||
|
blck_size --> ggml_type_sizef(["ggml_type_sizef()"])
|
||||||
|
blck_size --> ggml_quantize_chunk(["ggml_quantize_chunk()"])
|
||||||
|
```
|
||||||
|
|
||||||
|
This is how the LUT is used to convert a tensor data area to/from float for processing
|
||||||
|
(However, these methods are not used on the GPU, if I understand correctly, as these data areas are processed directly using GPU-specific instruction code.
|
||||||
|
This is also why the tensor elements have to be packed in a certain way.)
|
||||||
|
|
||||||
|
The analysis below covers only the connections within ggml.c.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph LR;
|
||||||
|
type_traits{"type_traits[]\n Lookup Table"}
|
||||||
|
%%type_traits-->type_name
|
||||||
|
%%type_traits-->blck_size
|
||||||
|
%%type_traits-->type_size
|
||||||
|
%%type_traits-->is_quantized
|
||||||
|
type_traits-->to_float
|
||||||
|
type_traits-->from_float
|
||||||
|
type_traits-->from_float_reference
|
||||||
|
%%type_traits-->vec_dot
|
||||||
|
%%type_traits-->vec_dot_type
|
||||||
|
%%type_traits-->nrows
|
||||||
|
|
||||||
|
ggml_compute_forward_add_q_f32(["ggml_compute_forward_add_q_f32()"])
|
||||||
|
to_float --> ggml_compute_forward_add_q_f32
|
||||||
|
ggml_compute_forward_out_prod_q_f32(["ggml_compute_forward_out_prod_q_f32()"])
|
||||||
|
to_float --> ggml_compute_forward_out_prod_q_f32
|
||||||
|
ggml_compute_forward_get_rows_q(["ggml_compute_forward_get_rows_q()"])
|
||||||
|
to_float --> ggml_compute_forward_get_rows_q
|
||||||
|
ggml_compute_forward_flash_attn_ext_f16(["ggml_compute_forward_flash_attn_ext_f16()"])
|
||||||
|
to_float --> ggml_compute_forward_flash_attn_ext_f16
|
||||||
|
|
||||||
|
ggml_compute_forward_dup_f16(["ggml_compute_forward_dup_f16()"])
|
||||||
|
from_float --> ggml_compute_forward_dup_f16
|
||||||
|
ggml_compute_forward_dup_bf16(["ggml_compute_forward_dup_bf16()"])
|
||||||
|
from_float --> ggml_compute_forward_dup_bf16
|
||||||
|
ggml_compute_forward_dup_f32(["ggml_compute_forward_dup_f32()"])
|
||||||
|
from_float --> ggml_compute_forward_dup_f32
|
||||||
|
ggml_compute_forward_add_q_f32(["ggml_compute_forward_add_q_f32()"])
|
||||||
|
from_float --> ggml_compute_forward_add_q_f32
|
||||||
|
ggml_compute_forward_mul_mat(["ggml_compute_forward_mul_mat()"])
|
||||||
|
from_float --> ggml_compute_forward_mul_mat
|
||||||
|
ggml_compute_forward_mul_mat_id(["ggml_compute_forward_mul_mat_id()"])
|
||||||
|
from_float --> ggml_compute_forward_mul_mat_id
|
||||||
|
ggml_compute_forward_flash_attn_ext_f16(["ggml_compute_forward_flash_attn_ext_f16()"])
|
||||||
|
from_float --> ggml_compute_forward_flash_attn_ext_f16
|
||||||
|
```
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user