From d677b5cb7d54891f09c6ec710bf1d0e6afcf9e4b Mon Sep 17 00:00:00 2001
From: brian khuu
Date: Thu, 13 Jun 2024 11:28:39 +1000
Subject: [PATCH] dev note on tensor encoding LUT

---
 dev-notes.md | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 114 insertions(+), 1 deletion(-)

diff --git a/dev-notes.md b/dev-notes.md
index b0c3fbb..36c1333 100644
--- a/dev-notes.md
+++ b/dev-notes.md
@@ -59,4 +59,117 @@ Aka it's for the writing/reading api.

There is a C++ example program that exercises a test GGUF write/read:

- [./example/gguf.cpp](https://github.com/ggerganov/llama.cpp/blob/master/examples/gguf/gguf.cpp)


### If we don't store the size of tensor array elements etc. in GGUF, where do we store it?

In ggml.c, refer to `static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT]`,
a lookup table containing enough information to deduce the size of a tensor layer
in bytes, given an offset and an element count per dimension.

A good example entry is the F16 one shown below (annotated for clarity):

```c
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    // ...
    [GGML_TYPE_F16] = {
        // General specs about this tensor encoding scheme
        .type_name = "f16",
        .blck_size = 1,
        .type_size = sizeof(ggml_fp16_t),
        .is_quantized = false,

        // C function pointers for interpreting the blocks
        .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
        .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,

        // C function pointers plus extra specs required for dot product handling
        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
        .vec_dot_type = GGML_TYPE_F16,
        .nrows = 1,
    },
    // ...
};
```

These traits are used in various places to let developers reason about a tensor
encoding's spec and sizing, as you can see with the getter functions below.
(Note: I didn't fully trace the other functions that use these values directly
within ggml.c; the few in this graph are just for illustration.)

```mermaid
graph LR;
    type_traits{"type_traits[]\nLookup Table"}
    type_traits --> type_name
    type_traits --> blck_size
    type_traits --> type_size
    type_traits --> is_quantized
    %%type_traits --> to_float
    %%type_traits --> from_float
    %%type_traits --> from_float_reference
    %%type_traits --> vec_dot
    %%type_traits --> vec_dot_type
    %%type_traits --> nrows

    subgraph getters["getter functions / methods"]
        ggml_type_name(["ggml_type_name()"])
        ggml_blck_size(["ggml_blck_size()"])
        ggml_type_size(["ggml_type_size()"])
        ggml_is_quantized(["ggml_is_quantized()"])
    end
    type_name --> ggml_type_name
    blck_size --> ggml_blck_size
    type_size --> ggml_type_size
    is_quantized --> ggml_is_quantized

    blck_size --> ggml_type_sizef(["ggml_type_sizef()"])
    blck_size --> ggml_quantize_chunk(["ggml_quantize_chunk()"])
```
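To make the sizing arithmetic concrete, here is a minimal sketch of how those LUT
fields answer this section's question. It is my own illustration, not code from the
tree: `row_bytes()` is a hypothetical helper, while the four getters are the real
public API, and ggml itself wraps the same arithmetic as `ggml_row_size()` in ggml.h.

```c
// Sketch: deriving tensor sizes from the type_traits LUT via the public getters.
// Assumes this is compiled against llama.cpp's ggml.h; getter return types have
// shifted between versions, hence the defensive casts in the printf call.
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

// Hypothetical helper: bytes for one row of n elements. The row is split into
// blocks of blck_size elements, and each block occupies type_size bytes.
// (For quantized types ggml asserts n is a multiple of blck_size.)
static size_t row_bytes(enum ggml_type type, int64_t n) {
    return (size_t) (n / ggml_blck_size(type)) * ggml_type_size(type);
}

int main(void) {
    // e.g. Q4_0 packs 32 elements into an 18-byte block (16 bytes of 4-bit
    // quants plus a 2-byte fp16 scale), so a 4096-element row costs
    // (4096 / 32) * 18 = 2304 bytes.
    const enum ggml_type types[] = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0 };
    for (size_t i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
        const enum ggml_type t = types[i];
        printf("%-4s blck_size=%-2d type_size=%-2zu quantized=%d -> 4096 elems = %zu bytes\n",
               ggml_type_name(t), (int) ggml_blck_size(t), ggml_type_size(t),
               (int) ggml_is_quantized(t), row_bytes(t, 4096));
    }
    return 0;
}
```

In other words, GGUF never needs to carry per-type sizing metadata: the type enum
stored in the file plus this in-code LUT is enough to lay out every tensor.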
This is how the LUT is used to convert a tensor data area to/from float for
processing. (As far as I understand, however, these methods are not used on the
GPU, where the data area is processed directly using GPU-specific instruction
code. This is also why the tensor elements have to be packed in a certain way.)

The analysis below only covers connections within ggml.c:

```mermaid
graph LR;
    type_traits{"type_traits[]\nLookup Table"}
    %%type_traits --> type_name
    %%type_traits --> blck_size
    %%type_traits --> type_size
    %%type_traits --> is_quantized
    type_traits --> to_float
    type_traits --> from_float
    type_traits --> from_float_reference
    %%type_traits --> vec_dot
    %%type_traits --> vec_dot_type
    %%type_traits --> nrows

    to_float --> ggml_compute_forward_add_q_f32(["ggml_compute_forward_add_q_f32()"])
    to_float --> ggml_compute_forward_out_prod_q_f32(["ggml_compute_forward_out_prod_q_f32()"])
    to_float --> ggml_compute_forward_get_rows_q(["ggml_compute_forward_get_rows_q()"])
    to_float --> ggml_compute_forward_flash_attn_ext_f16(["ggml_compute_forward_flash_attn_ext_f16()"])

    from_float --> ggml_compute_forward_dup_f16(["ggml_compute_forward_dup_f16()"])
    from_float --> ggml_compute_forward_dup_bf16(["ggml_compute_forward_dup_bf16()"])
    from_float --> ggml_compute_forward_dup_f32(["ggml_compute_forward_dup_f32()"])
    from_float --> ggml_compute_forward_add_q_f32
    from_float --> ggml_compute_forward_mul_mat(["ggml_compute_forward_mul_mat()"])
    from_float --> ggml_compute_forward_mul_mat_id(["ggml_compute_forward_mul_mat_id()"])
    from_float --> ggml_compute_forward_flash_attn_ext_f16
```
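To illustrate the dispatch pattern in the graph above, here is a self-contained
sketch. The toy int8 encoding and every name in it are invented for illustration;
in ggml the corresponding typedefs are `ggml_to_float_t` / `ggml_from_float_t`
(visible in the F16 entry earlier), and kernels such as
`ggml_compute_forward_mul_mat()` fetch these pointers out of `type_traits[type]`
rather than switching on the type.

```c
// Toy demo of the to_float / from_float dispatch pattern (not real ggml code).
#include <stdint.h>
#include <stdio.h>

// Stand-ins for ggml_to_float_t / ggml_from_float_t: convert k elements
// between a packed row and a plain float row.
typedef void (*to_float_t)(const void * x, float * y, int64_t k);
typedef void (*from_float_t)(const float * x, void * y, int64_t k);

// Invented "encoding": floats stored as int8 with a fixed scale of 1/16,
// purely to give the function pointers something to do.
static void i8_to_f32_row(const void * x, float * y, int64_t k) {
    const int8_t * q = (const int8_t *) x;
    for (int64_t i = 0; i < k; i++) y[i] = q[i] / 16.0f;
}

static void f32_to_i8_row(const float * x, void * y, int64_t k) {
    int8_t * q = (int8_t *) y;
    for (int64_t i = 0; i < k; i++) q[i] = (int8_t)(x[i] * 16.0f);
}

// A kernel only ever sees this pair, so it can process any encoding the
// LUT describes without a per-type switch statement.
struct conv_traits {
    to_float_t   to_float;
    from_float_t from_float;
};

int main(void) {
    const struct conv_traits t = { i8_to_f32_row, f32_to_i8_row };

    float  src[4] = { 0.5f, -0.25f, 1.0f, 0.0f };
    int8_t packed[4];
    float  dst[4];

    t.from_float(src, packed, 4); // pack the row ("quantize")
    t.to_float(packed, dst, 4);   // unpack it again for f32 compute

    for (int i = 0; i < 4; i++) printf("%g ", dst[i]); // 0.5 -0.25 1 0
    printf("\n");
    return 0;
}
```

Adding a new encoding then means adding one LUT entry instead of touching every
compute function, which is why the CPU-side kernels in the graph above can stay
encoding-agnostic.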