There is a C++ example program that demonstrates a test GGUF write/read:

- [./examples/gguf/gguf.cpp](https://github.com/ggerganov/llama.cpp/blob/master/examples/gguf/gguf.cpp)

### If we don't store the size of tensor array elements etc. in GGUF, where do we store these?

In ggml.c, refer to `static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT]`, a lookup table containing enough information to deduce the size of a tensor layer in bytes, given an offset and element dimension count.

One good example is shown below (annotated for clarity):

```c
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    ...
    [GGML_TYPE_F16] = {
        // General specs about this tensor encoding scheme
        .type_name            = "f16",
        .blck_size            = 1,
        .type_size            = sizeof(ggml_fp16_t),
        .is_quantized         = false,

        // C function pointers for interpreting the blocks
        .to_float             = (ggml_to_float_t) ggml_fp16_to_fp32_row,
        .from_float           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
        .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,

        // C function pointers plus extra specs required for dot product handling
        .vec_dot              = (ggml_vec_dot_t) ggml_vec_dot_f16,
        .vec_dot_type         = GGML_TYPE_F16,
        .nrows                = 1,
    },
    ...
};
```
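
As a minimal sketch of that size deduction: the `row_size_bytes()` helper below is hypothetical, but it mirrors what ggml's `ggml_row_size()` computes from the `blck_size` and `type_size` fields.

```c
#include <stddef.h>
#include <stdint.h>

// Hypothetical helper mirroring ggml's row-size computation: a row of ne
// elements is packed as ne / blck_size blocks of type_size bytes each.
static size_t row_size_bytes(int64_t ne, int64_t blck_size, size_t type_size) {
    return type_size * (size_t)(ne / blck_size);
}

// For GGML_TYPE_F16: blck_size = 1, type_size = sizeof(ggml_fp16_t) = 2,
// so a 4096-element row occupies 4096 / 1 * 2 = 8192 bytes.
```

For quantized types the same formula holds, just with `blck_size > 1` (e.g. 32 elements per block) and `type_size` equal to the byte size of one packed block.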

These entries are used in various places to let developers query the tensor encoding spec and sizing, as the getter functions below show. (Note: the other functions within ggml.c that use these values directly have not been traced fully; the few in this graph are just for illustration.)

```mermaid
graph LR;
type_traits{"type_traits[]\n Lookup Table"}
type_traits-->type_name
type_traits-->blck_size
type_traits-->type_size
type_traits-->is_quantized
%%type_traits-->to_float
%%type_traits-->from_float
%%type_traits-->from_float_reference
%%type_traits-->vec_dot
%%type_traits-->vec_dot_type
%%type_traits-->nrows

subgraph getter functions / methods
ggml_type_name(["ggml_type_name()"])
ggml_blck_size(["ggml_blck_size()"])
ggml_type_size(["ggml_type_size()"])
ggml_is_quantized(["ggml_is_quantized()"])
end

type_name --> ggml_type_name
blck_size --> ggml_blck_size
type_size --> ggml_type_size
is_quantized --> ggml_is_quantized

blck_size --> ggml_type_sizef(["ggml_type_sizef()"])
blck_size --> ggml_quantize_chunk(["ggml_quantize_chunk()"])
```
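
A minimal usage sketch of those getters, assuming `ggml.h` is on the include path (the casts hedge against integer widths differing between ggml versions):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    const enum ggml_type type = GGML_TYPE_F16;

    // each getter just reads the matching field out of type_traits[type]
    printf("type_name    : %s\n",   ggml_type_name(type));
    printf("blck_size    : %lld\n", (long long) ggml_blck_size(type));
    printf("type_size    : %zu\n",  ggml_type_size(type));
    printf("is_quantized : %d\n",   (int) ggml_is_quantized(type));
    return 0;
}
```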

This is how the LUT is used to convert a tensor data area to/from float for processing. (If I understand correctly, these methods are not used on the GPU, since those data areas are processed directly by GPU-specific instruction code. This is also why the tensor elements have to be packed in a certain way.)

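A hedged sketch of that to-float path, assuming this ggml version exposes the LUT via `ggml_internal_get_type_traits()` (renamed in some later versions):

```c
#include <stdint.h>
#include "ggml.h"

// Dequantize one packed row into ne float values via the LUT's to_float
// method, roughly what ggml_compute_forward_get_rows_q() does per row.
static void row_to_f32(enum ggml_type type, const void * packed, float * out, int64_t ne) {
    const ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
    if (traits.to_float != NULL) {        // NULL for types without a converter
        traits.to_float(packed, out, ne); // reads ne/blck_size blocks, writes ne floats
    }
}
```
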
The analysis below covers only connections within ggml.c:

```mermaid
graph LR;
type_traits{"type_traits[]\n Lookup Table"}
%%type_traits-->type_name
%%type_traits-->blck_size
%%type_traits-->type_size
%%type_traits-->is_quantized
type_traits-->to_float
type_traits-->from_float
type_traits-->from_float_reference
%%type_traits-->vec_dot
%%type_traits-->vec_dot_type
%%type_traits-->nrows

ggml_compute_forward_add_q_f32(["ggml_compute_forward_add_q_f32()"])
to_float --> ggml_compute_forward_add_q_f32
ggml_compute_forward_out_prod_q_f32(["ggml_compute_forward_out_prod_q_f32()"])
to_float --> ggml_compute_forward_out_prod_q_f32
ggml_compute_forward_get_rows_q(["ggml_compute_forward_get_rows_q()"])
to_float --> ggml_compute_forward_get_rows_q
ggml_compute_forward_flash_attn_ext_f16(["ggml_compute_forward_flash_attn_ext_f16()"])
to_float --> ggml_compute_forward_flash_attn_ext_f16

ggml_compute_forward_dup_f16(["ggml_compute_forward_dup_f16()"])
from_float --> ggml_compute_forward_dup_f16
ggml_compute_forward_dup_bf16(["ggml_compute_forward_dup_bf16()"])
from_float --> ggml_compute_forward_dup_bf16
ggml_compute_forward_dup_f32(["ggml_compute_forward_dup_f32()"])
from_float --> ggml_compute_forward_dup_f32
from_float --> ggml_compute_forward_add_q_f32
ggml_compute_forward_mul_mat(["ggml_compute_forward_mul_mat()"])
from_float --> ggml_compute_forward_mul_mat
ggml_compute_forward_mul_mat_id(["ggml_compute_forward_mul_mat_id()"])
from_float --> ggml_compute_forward_mul_mat_id
from_float --> ggml_compute_forward_flash_attn_ext_f16
```