From ee804f6223777019cf921e0d99cc24669313ab98 Mon Sep 17 00:00:00 2001 From: Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:15:06 +0900 Subject: [PATCH 01/44] ci: apply concurrency limit for github workflows (#6243) --- .github/workflows/build.yml | 4 ++++ .github/workflows/code-coverage.yml | 4 ++++ .github/workflows/docker.yml | 4 ++++ .github/workflows/editorconfig.yml | 4 ++++ .github/workflows/nix-ci-aarch64.yml | 4 ++++ .github/workflows/nix-ci.yml | 4 ++++ .github/workflows/python-check-requirements.yml | 4 ++++ .github/workflows/python-lint.yml | 4 ++++ .github/workflows/server.yml | 4 ++++ .github/workflows/zig-build.yml | 4 ++++ 10 files changed, 40 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 469d81165..7711bd8d8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,6 +15,10 @@ on: types: [opened, synchronize, reopened] paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 392db8a08..4112518bb 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -5,6 +5,10 @@ env: GGML_NLOOP: 3 GGML_N_THREADS: 1 +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: run: runs-on: ubuntu-20.04 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 94f9161fc..9591bfc2a 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -15,6 +15,10 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: push_to_registry: name: Push Docker image to 
Docker Hub diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index 0e0993cd4..7b2a00c90 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -14,6 +14,10 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: editorconfig: runs-on: ubuntu-latest diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index 8d0a3fd7f..109a793ea 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -17,6 +17,10 @@ on: types: [opened, synchronize, reopened] paths: ['**/*.nix', 'flake.lock'] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: nix-build-aarch64: runs-on: ubuntu-latest diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index 01c5a9d5a..8b5b99c8f 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -8,6 +8,10 @@ on: pull_request: types: [opened, synchronize, reopened] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: nix-eval: strategy: diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index b82205992..4092b12fa 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -16,6 +16,10 @@ on: - 'requirements.txt' - 'requirements/*.txt' +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: python-check-requirements: runs-on: ubuntu-latest diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index ea0a05ea1..4bdd79c4a 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -2,6 +2,10 @@ name: flake8 Lint on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + 
cancel-in-progress: true + jobs: flake8-lint: runs-on: ubuntu-latest diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 65ca7d9ca..b74dc5e21 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -18,6 +18,10 @@ on: schedule: - cron: '0 0 * * *' +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: server: runs-on: ubuntu-latest diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml index 68a698ab9..cb43954eb 100644 --- a/.github/workflows/zig-build.yml +++ b/.github/workflows/zig-build.yml @@ -6,6 +6,10 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: build: strategy: From dba1af612926cbd4ebe2d876277af1e3305177e0 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 22 Mar 2024 19:00:01 +0100 Subject: [PATCH 02/44] llama_model_loader: support multiple split/shard GGUFs (#6187) * split: support in llama_model_loader * avoid copying the entire vector Co-authored-by: slaren * split: move llama_tensor_offset to llama_model_loader * llama_model_loader: PR feedbacks: - use only one gguf_context for metadata only - store all ggml_context in a vector as the files and mappings - store all weights in a vector along with the source tensor - rename ctx_gguf to meta - rename ctx_meta to contexts * avoid copying the entire vector * Simplify this by making these optional, switch some layer creation tensor optional Co-authored-by: Georgi Gerganov * Handle optional tensors Co-authored-by: Georgi Gerganov * llama_model_loader: fail if backend cannot allocate buffer * fix mmap buffer management * llama_model_loader: map file to backend buffer if the allocation succeeds only * llama_model_loader: only map tensors included in the context * llama_model_loader: minor, use same variable name for consistency, fix spacing in types cast * llama_model_loader: fail if any of backend buffer 
cannot be allocated * spacing Co-authored-by: slaren * fix loop over pointer Co-authored-by: slaren * llama_model_loader: if n_tensors declared not equals to loaded tensors in split, throw an exception instead of asserting * llama_model_loader: ensure mappings vector has the expected size * llama_model_loader: use at instead of operator[] if this should never add to the map. * llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size. * llama_model_loader: be sure the model mappings has enough capacity before allocating backend buffer * llama_model_loader: fix map -> unordered map * llama_split_prefix: use a clearer version, not pass split path len but dest max len. Co-authored-by: Xuan Son Nguyen * llama : minor ggml-ci * llama : introduce some typedef helpers * docs: add model shard in hot topic * llama_model_loader: put mapping in a unique_ptr from the moment it is allocated Co-authored-by: slaren * fix llama_split_prefix --------- Co-authored-by: slaren Co-authored-by: Georgi Gerganov Co-authored-by: Xuan Son Nguyen --- README.md | 1 + examples/gguf-split/gguf-split.cpp | 151 ++++----- llama.cpp | 474 ++++++++++++++++++++--------- llama.h | 10 + 4 files changed, 412 insertions(+), 224 deletions(-) diff --git a/README.md b/README.md index 368489caa..f9cf19616 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 +- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187 ---- diff --git a/examples/gguf-split/gguf-split.cpp 
b/examples/gguf-split/gguf-split.cpp index 8e12e6493..f703588e1 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,31 +1,34 @@ #include "llama.h" -#include "ggml.h" #include "common.h" #include #include -#include #include #include -#include #include #include #include -#include #include +#include +#include + +#if defined(_WIN32) + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif enum split_operation : uint8_t { SPLIT_OP_SPLIT, SPLIT_OP_MERGE, }; -static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split"; -static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count"; - -static const int SPLIT_FILENAME_MAX = 256; - -static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf"; +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; struct split_params { split_operation operation = SPLIT_OP_SPLIT; @@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para try { if (!split_params_parse_ex(argc, argv, params)) { split_print_usage(argv[0]); - exit(1); + exit(EXIT_FAILURE); } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); split_print_usage(argv[0]); - exit(1); + exit(EXIT_FAILURE); } return result; } @@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) { } } -static std::string split_file_name(const std::string & path, int i_split, int n_split) { - char f_split[SPLIT_FILENAME_MAX] = {0}; - snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split); - return std::string(f_split); -} - struct split_strategy { const split_params params; std::ifstream & f_input; @@ -180,8 +177,9 @@ struct split_strategy { if (i_split == 0) { gguf_set_kv(ctx_out, ctx_gguf); } - gguf_set_val_u8(ctx_out, 
LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split); + gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); // populate the original tensors, so we get an initial metadata for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { @@ -189,10 +187,11 @@ struct split_strategy { gguf_add_tensor(ctx_out, meta); } - auto split_name = split_file_name(params.output, i_split, n_split); + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); - fprintf(stderr, "%s: %s ...", __func__, split_name.c_str()); - fout = std::ofstream(split_name, std::ios::binary); + fprintf(stderr, "%s: %s ...", __func__, split_path); + fout = std::ofstream(split_path, std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors auto meta_size = gguf_get_meta_size(ctx_out); @@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) { std::ifstream f_input(split_params.input.c_str(), std::ios::binary); if (!f_input.is_open()) { fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); if (!ctx_gguf) { fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + + char first_split_path[PATH_MAX] = {0}; + llama_split_path(first_split_path, sizeof(first_split_path), + split_params.output.c_str(), strategy.i_split, strategy.n_split); fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n", __func__, split_params.input.c_str(), - split_file_name(split_params.output, 
strategy.i_split, strategy.n_split).c_str(), + first_split_path, split_params.n_split_tensors); strategy.split_start(); @@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) { std::vector ctx_metas; std::vector ctx_ggufs; - std::string split_prefix; + char split_path[PATH_MAX] = {0}; + strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); + char split_prefix[PATH_MAX] = {0}; // First pass to find KV and tensors metadata for (int i_split = 0; i_split < n_split; i_split++) { @@ -309,89 +314,66 @@ static void gguf_merge(const split_params & split_params) { /*.ctx = */ &ctx_meta, }; - auto split_name = split_params.input; if (i_split > 0) { - split_name = split_file_name(split_prefix, i_split, n_split); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); } - fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); - auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params); + auto * ctx_gguf = gguf_init_from_file(split_path, params); if (!ctx_gguf) { fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } ctx_ggufs.push_back(ctx_gguf); ctx_metas.push_back(ctx_meta); if (i_split == 0) { - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT); + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); if (key_n_split < 0) { fprintf(stderr, "\n%s: input file does not contain %s metadata\n", __func__, - LLM_KV_GENERAL_SPLIT_N_SPLIT); + LLM_KV_SPLIT_COUNT); gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - n_split = gguf_get_val_u8(ctx_gguf, key_n_split); + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); if (n_split < 1) { fprintf(stderr, "\n%s: input file does not contain a valid split count %d\n", __func__, n_split); 
gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); + } + + // Verify the file naming and extract split_prefix + if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { + fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d" + " n_split=%d\n", __func__, + split_path, i_split, n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + fout.close(); + exit(EXIT_FAILURE); } // Do not trigger merge if we try to merge again the output - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); + gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); // Set metadata from the first split gguf_set_kv(ctx_out, ctx_gguf); } - // Verify the file naming - { - int i_split_file = 0; - int n_split_file = 0; - const char * i_split_format = "-00000-of-00000.gguf"; - - if (split_name.size() < strlen(i_split_format)) { - fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } - gguf_free(ctx_out); - fout.close(); - exit(1); - } - - split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format)); - - const char * split_name_c_str = split_name.c_str(); - int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file); - - if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) { - fprintf(stderr, "\n%s: unexpected input file name: %s" - " i_split=%d i_split_file=%d" - " n_split=%d n_split_file=%d\n", __func__, - split_params.input.c_str(), - i_split, i_split_file, - n_split, n_split_file); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } - gguf_free(ctx_out); - fout.close(); - exit(1); - } - } - auto n_tensors = gguf_get_n_tensors(ctx_gguf); for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { const char * t_name = gguf_get_tensor_name(ctx_gguf, 
i_tensor); @@ -411,18 +393,19 @@ static void gguf_merge(const split_params & split_params) { // Write tensors data for (int i_split = 0; i_split < n_split; i_split++) { - auto split_name = split_file_name(split_prefix, i_split, n_split); - std::ifstream f_input(split_name.c_str(), std::ios::binary); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + std::ifstream f_input(split_path, std::ios::binary); if (!f_input.is_open()) { - fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); + for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { + gguf_free(ctx_ggufs[i]); + ggml_free(ctx_metas[i]); } gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); auto * ctx_gguf = ctx_ggufs[i_split]; auto * ctx_meta = ctx_metas[i_split]; @@ -481,8 +464,8 @@ int main(int argc, const char ** argv) { break; case SPLIT_OP_MERGE: gguf_merge(params); break; - default:split_print_usage(argv[0]); - exit(1); + default: split_print_usage(argv[0]); + exit(EXIT_FAILURE); } return 0; diff --git a/llama.cpp b/llama.cpp index 91bd6b8d0..aa6c89246 100644 --- a/llama.cpp +++ b/llama.cpp @@ -52,6 +52,9 @@ #define NOMINMAX #endif #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif #include #endif @@ -290,6 +293,10 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, + LLM_KV_SPLIT_NO, + LLM_KV_SPLIT_COUNT, + LLM_KV_SPLIT_TENSORS_COUNT, + LLM_KV_SSM_INNER_SIZE, LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, @@ -355,6 +362,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, 
"%s.rope.scaling.finetuned" }, + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, @@ -1099,6 +1110,7 @@ struct llama_file { } } }; +using llama_files = std::vector>; struct llama_mmap { void * addr; @@ -1299,6 +1311,7 @@ struct llama_mmap { } #endif }; +using llama_mmaps = std::vector>; // Represents some region of memory being locked using mlock or VirtualLock; // will automatically unlock on destruction. @@ -1448,6 +1461,7 @@ struct llama_mlock { static void raw_unlock(const void * addr, size_t len) {} #endif }; +using llama_mlocks = std::vector>; static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); @@ -2023,12 +2037,12 @@ struct llama_model { // the model memory buffers for the tensor data std::vector bufs; - // model memory mapped file - std::unique_ptr mapping; + // model memory mapped files + llama_mmaps mappings; // objects representing data potentially being locked in memory - std::vector> mlock_bufs; - llama_mlock mlock_mmap; + llama_mlocks mlock_bufs; + llama_mlocks mlock_mmaps; // for quantize-stats only std::vector> tensors_by_name; @@ -2792,6 +2806,8 @@ namespace GGUFMeta { }; } +using llama_buf_map = std::unordered_map; + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -2802,54 +2818,133 @@ struct llama_model_loader { bool use_mmap = false; - llama_file file; + llama_files files; llama_ftype ftype; llama_fver fver; - std::unique_ptr mapping; + llama_mmaps mappings; + + // Holds information on a model weights + struct llama_tensor_weights { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file + + ggml_tensor * tensor; + + llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * 
gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, name); + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + } + }; + std::vector weights; + std::unordered_map kv_overrides; - struct gguf_context * ctx_gguf = NULL; - struct ggml_context * ctx_meta = NULL; + struct gguf_context * meta = NULL; + std::vector contexts; std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { + llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); } - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - if (param_overrides_p != nullptr) { for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) { kv_overrides.insert({std::string(p->key), *p}); } } - ctx_gguf = gguf_init_from_file(fname.c_str(), params); - if (!ctx_gguf) { + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + meta = gguf_init_from_file(fname.c_str(), params); + if (!meta) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - n_kv = gguf_get_n_kv(ctx_gguf); - n_tensors = gguf_get_n_tensors(ctx_gguf); + // Save tensors data offset of the main file. + // For subsidiary files, `meta` tensor data offset must not be used, + // so we build a unified tensors index for weights. 
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur)); + } + files.emplace_back(new llama_file(fname.c_str(), "rb")); + contexts.emplace_back(ctx); - fver = (enum llama_fver ) gguf_get_version(ctx_gguf); + uint16_t n_split = 0; + get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); - n_elements += ggml_nelements(t); - n_bytes += ggml_nbytes(t); + // Load additional GGML contexts + if (n_split > 1) { + uint16_t idx = 0; + get_key(llm_kv(LLM_KV_SPLIT_NO), idx); + if (idx != 0) { + throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); + } + + char split_prefix[PATH_MAX] = {0}; + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) { + throw std::runtime_error(format("invalid split file: %s", fname.c_str())); + } + + if (trace > 0) { + LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); + } + + char split_path[PATH_MAX] = {0}; + for (idx = 1; idx < n_split; idx++) { + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params); + if (!ctx_gguf) { + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); + } + + // Save tensors data offset info of the shard. 
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur)); + } + files.emplace_back(new llama_file(split_path, "rb")); + contexts.emplace_back(ctx); + + gguf_free(ctx_gguf); + } + + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); + + // sanity check + { + const int n_tensors_loaded = (int) weights.size(); + if (n_tensors != n_tensors_loaded) { + throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + } + } + + LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); + } + + n_kv = gguf_get_n_kv(meta); + n_tensors = weights.size(); + + fver = (enum llama_fver) gguf_get_version(meta); + + for (auto & w : weights) { + n_elements += ggml_nelements(w.tensor); + n_bytes += ggml_nbytes(w.tensor); } LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", @@ -2864,7 +2959,8 @@ struct llama_model_loader { enum ggml_type type_max = GGML_TYPE_F32; for (int i = 0; i < n_tensors; i++) { - enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i); + const ggml_tensor * tensor = weights.at(i).tensor; + enum ggml_type type = tensor->type; n_type[type]++; @@ -2874,8 +2970,8 @@ struct llama_model_loader { } if (trace > 0) { - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str()); + const uint16_t sid = weights.at(i).idx; + LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } @@ -2911,22 +3007,23 @@ struct llama_model_loader { ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); { - const int kid = gguf_find_key(ctx_gguf, 
"general.file_type"); + const int kid = gguf_find_key(meta, "general.file_type"); if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); + ftype = (llama_ftype) gguf_get_val_u32(meta, kid); } } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(ctx_gguf, i); - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + const char * name = gguf_get_key(meta, i); + const enum gguf_type type = gguf_get_kv_type(meta, i); const std::string type_name = type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i)) + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(ctx_gguf, i); + std::string value = gguf_kv_to_str(meta, i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -2955,18 +3052,18 @@ struct llama_model_loader { } ~llama_model_loader() { - if (ctx_gguf) { - gguf_free(ctx_gguf); + if (meta) { + gguf_free(meta); } - if (ctx_meta) { - ggml_free(ctx_meta); + for (auto * ctx : contexts) { + ggml_free(ctx); } } template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(ctx_gguf, key.c_str()); + const int kid = gguf_find_key(meta, key.c_str()); if (kid < 0) { if (required) { @@ -2976,7 +3073,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(ctx_gguf, kid); + GGUFMeta::GKV::get_kv(meta, kid); result = arr_info.length; @@ -2996,7 +3093,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? 
&it->second : nullptr; - const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + const bool found = GGUFMeta::GKV::set(meta, key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -3019,20 +3116,33 @@ struct llama_model_loader { } const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); + return weights.at(i).tensor->name; + } + + const llama_tensor_weights & get_weights(const char * name) const { + for (const auto & weight : weights) { + if (strcmp(name, weight.tensor->name) == 0) { + return weight; + } + } + throw std::runtime_error(format("tensor %s not found", name)); } struct ggml_tensor * get_tensor_meta(const char * name) const { - return ggml_get_tensor(ctx_meta, name); + try { + return get_weights(name).tensor; + } catch (const std::runtime_error & e) { + return NULL; + } } struct ggml_tensor * get_tensor_meta(int i) const { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - ggml_set_name(tensor, ggml_get_name(meta)); + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) { + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); + ggml_set_name(tensor, ggml_get_name(cur)); n_created++; @@ -3040,7 +3150,7 @@ struct llama_model_loader { } struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); if (cur == NULL) { if (!required) { @@ -3075,76 +3185,79 @@ struct llama_model_loader { } } - size_t file_offset(const char * name) const { - const int idx = gguf_find_tensor(ctx_gguf, name); - - if (idx < 0) { - throw 
std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); - } - - return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); - } - - void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { - // prefetch the whole file - all the data is needed anyway + void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) { if (use_mmap) { - mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); + mappings.reserve(files.size()); + mmaps_used.reserve(files.size()); + for (const auto & file : files) { + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); + mmaps_used.emplace_back(std::make_pair(mapping->size, 0)); + if (mlock_mmaps) { + std::unique_ptr mlock_mmap(new llama_mlock()); + mlock_mmap->init(mapping->addr); + mlock_mmaps->emplace_back(std::move(mlock_mmap)); + } + mappings.emplace_back(std::move(mapping)); + } } // compute the total size of all tensors for progress reporting - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - } - - if (use_mmap && mapping) { - if (lmlock) { - lmlock->init(mapping->addr); - } - mmap_used_first = mapping->size; + for (auto & w : weights) { + size_data += ggml_nbytes(w.tensor); } } - void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { - GGML_ASSERT(mapping); + void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { + GGML_ASSERT(!mappings.empty()); + const auto & mapping = mappings.at(idx); *first = mapping->size; *last = 0; + *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const size_t offs = file_offset(ggml_get_name(tensor)); - *first = std::min(*first, offs); - *last = std::max(*last, offs + 
ggml_nbytes(tensor)); + const auto & w = get_weights(ggml_get_name(tensor)); + if (w.idx != idx) { + continue; + } + *first = std::min(*first, w.offs); + *last = std::max(*last, w.offs + ggml_nbytes(tensor)); } } // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { - const size_t offs = file_offset(ggml_get_name(cur)); + const auto & w = get_weights(ggml_get_name(cur)); - if (use_mmap && mapping) { + if (use_mmap) { + const auto & mapping = mappings.at(w.idx); if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr + offs; + cur->data = (uint8_t *)mapping->addr + w.offs; } else { - memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur)); } } else { GGML_ASSERT(cur->data != nullptr); - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); + file->seek(w.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); } } size_t size_done = 0; size_t size_data = 0; - size_t mmap_used_first = -1; - size_t mmap_used_last = 0; + std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { - GGML_ASSERT(size_data != 0 && "call init_mapping() first"); + bool load_all_data( + struct ggml_context * ctx, + llama_buf_map & bufs_mmap, + llama_mlocks * lmlocks, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { + GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { if (!progress_callback((float) size_done / size_data, 
progress_callback_user_data)) { @@ -3152,41 +3265,57 @@ struct llama_model_loader { } } - const size_t offs = file_offset(ggml_get_name(cur)); + const auto & w = get_weights(ggml_get_name(cur)); + size_t n_size = ggml_nbytes(cur); - if (use_mmap && mapping) { + if (use_mmap) { + const auto & mapping = mappings.at(w.idx); + ggml_backend_buffer_t buf_mmap = nullptr; + if (bufs_mmap.count(w.idx)) { + buf_mmap = bufs_mmap.at(w.idx); + } + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + w.offs); + if (lmlocks) { + const auto & lmlock = lmlocks->at(w.idx); + lmlock->grow_to(w.offs + ggml_nbytes(cur)); } - mmap_used_first = std::min(mmap_used_first, offs); - mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); + + auto & mmap_used = mmaps_used[w.idx]; + mmap_used.first = std::min(mmap_used.first, w.offs); + mmap_used.second = std::max(mmap_used.second, w.offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size); } } else { + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + file->seek(w.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); + file->seek(w.offs, SEEK_SET); + file->read_raw(read_buf.data(), ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, 
read_buf.data(), 0, n_size); } } - size_done += ggml_nbytes(cur); + size_done += n_size; } // check if this is the last call and do final cleanup if (size_done >= size_data) { // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_used_first); - if (mmap_used_last != 0) { - mapping->unmap_fragment(mmap_used_last, mapping->size); + if (use_mmap) { + for (uint32_t idx = 0; idx < mappings.size(); idx++) { + const auto & mmap_used = mmaps_used.at(idx); + auto & mapping = mappings.at(idx); + mapping->unmap_fragment(0, mmap_used.first); + if (mmap_used.second != 0) { + mapping->unmap_fragment(mmap_used.second, mapping->size); + } } } if (progress_callback) { @@ -3319,7 +3448,7 @@ static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { auto & hparams = model.hparams; - const gguf_context * ctx = ml.ctx_gguf; + const gguf_context * ctx = ml.meta; // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { @@ -3709,7 +3838,7 @@ static void llm_load_vocab( llama_model & model) { auto & vocab = model.vocab; - struct gguf_context * ctx = ml.ctx_gguf; + struct gguf_context * ctx = ml.meta; const auto kv = LLM_KV(model.arch); @@ -4319,10 +4448,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); - layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); - } + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false); layer.wqkv = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); @@ -5024,56 +5151,97 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); + ml.init_mappings(true, &model.mlock_mmaps); + model.mappings.reserve(ml.mappings.size()); // create the backend buffers - std::vector> ctx_bufs; + std::vector> ctx_bufs; + ctx_bufs.reserve(ctx_map.size()); + + // Ensure we have enough capacity for the maximum backend buffer we will potentially create + size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); + model.bufs.reserve(n_max_backend_buffer); for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = nullptr; + ggml_context * ctx = it.second; + + llama_buf_map bufs; + bufs.reserve(n_max_backend_buffer); // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, idx, ctx); + if (first >= last) { + continue; + } + ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend CPU buffer"); + } + 
model.bufs.push_back(buf); + bufs.emplace(idx, buf); #ifdef GGML_USE_CUBLAS - if (n_layer >= n_gpu_layers) { - ggml_backend_cuda_register_host_buffer( + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( ggml_backend_buffer_get_base(buf), ggml_backend_buffer_get_size(buf)); - } + } #endif + } } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { - const size_t max_size = ggml_get_max_tensor_size(ctx); - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + const size_t max_size = ggml_get_max_tensor_size(ctx); + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, idx, ctx); + if (first >= last) { + continue; + } + ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend metal buffer"); + } + model.bufs.push_back(buf); + bufs.emplace(idx, buf); + } } #endif else { - buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + model.bufs.push_back(buf); + if (use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); mlock_buf->init (ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + bufs.emplace(idx, buf); + } } - if (buf == nullptr) { + + if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } - // indicate that this buffer 
contains weights - // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf); - ctx_bufs.emplace_back(ctx, buf); + + for (auto & buf : bufs) { + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight + ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + + ctx_bufs.emplace_back(ctx, bufs); } if (llama_supports_gpu_offload()) { @@ -5105,13 +5273,15 @@ static bool llm_load_tensors( // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; - ggml_backend_buffer_t buf = it.second; - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) { + auto & bufs = it.second; + if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) { return false; } } - model.mapping = std::move(ml.mapping); + for (auto & mapping : ml.mappings) { + model.mappings.emplace_back(std::move(mapping)); + } // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration @@ -12302,7 +12472,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s #endif llama_model_loader ml(fname_inp, use_mmap, NULL); - ml.init_mapping(false); // no prefetching? + ml.init_mappings(false); // no prefetching? 
llama_model model; llm_load_arch(ml, model); @@ -12326,12 +12496,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s struct gguf_context * ctx_out = gguf_init_empty(); // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml.ctx_gguf); + gguf_set_kv (ctx_out, ml.meta); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); + const struct ggml_tensor * meta = ml.get_tensor_meta(i); const std::string name = ggml_get_name(meta); @@ -12371,7 +12541,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // populate the original tensors so we get an initial meta data for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); + const struct ggml_tensor * meta = ml.get_tensor_meta(i); gguf_add_tensor(ctx_out, meta); } @@ -12576,7 +12746,7 @@ static int llama_apply_lora_from_file_internal( if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - ml->init_mapping(/*prefetch*/ false); // no prefetching + ml->init_mappings(/*prefetch*/ false); // no prefetching } struct tensor_meta { @@ -12697,7 +12867,7 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * base_t; if (ml) { - if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + if (!ml->get_tensor_meta(base_name.c_str())) { LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } @@ -14645,6 +14815,30 @@ LLAMA_API int32_t llama_chat_apply_template( return res; } +LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { + static const char * const SPLIT_PATH_FORMAT = 
"%s-%05d-of-%05d.gguf"; + if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { + return strlen(split_path); + } + return 0; +} + +int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) { + std::string str_split_path(split_path); + char postfix[32]; + snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); + std::string str_postfix(postfix); + + // check if dest ends with postfix + int size_prefix = str_split_path.size() - str_postfix.size(); + if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { + snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path); + return size_prefix; + } + + return 0; +} + struct llama_timings llama_get_timings(struct llama_context * ctx) { struct llama_timings result = { /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, diff --git a/llama.h b/llama.h index 40dcf54e3..7e8ac4b62 100644 --- a/llama.h +++ b/llama.h @@ -960,6 +960,16 @@ extern "C" { int32_t n_past, int32_t n_predict); + /// @details Build a split GGUF final path for this chunk. + /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" + // Returns the split_path length. + LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); + + /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. + /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" + // Returns the split_prefix length. 
+ LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); + // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); From 1d0331c12a2f2a6296b471232bd4e66fbf06e6a1 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 22 Mar 2024 19:47:14 +0100 Subject: [PATCH 03/44] quantize: options for output and token embedding tensors qtype (#6239) * quantize: be able to specify the output tensor type * quantize: be able to specify the token embedding tensor type --------- Co-authored-by: Iwan Kawrakow --- examples/quantize/quantize.cpp | 24 +++++++++++++++++ llama.cpp | 47 ++++++++++++++++++++-------------- llama.h | 16 +++++++----- 3 files changed, 61 insertions(+), 26 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 7662ec80c..79e60ea7b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -189,6 +189,18 @@ static void prepare_imatrix(const std::string& imatrix_file, } } +static ggml_type parse_ggml_type(const char * arg) { + ggml_type result = GGML_TYPE_COUNT; + for (int j = 0; j < GGML_TYPE_COUNT; ++j) { + auto type = ggml_type(j); + const auto * name = ggml_type_name(type); + if (name && strcmp(arg, name) == 0) { + result = type; break; + } + } + return result; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -203,6 +215,18 @@ int main(int argc, char ** argv) { for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { params.quantize_output_tensor = false; + } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) { + if (arg_idx < argc-1) { + params.output_tensor_type = parse_ggml_type(argv[++arg_idx]); + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) { + if (arg_idx < argc-1) { + 
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]); + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; } else if (strcmp(argv[arg_idx], "--pure") == 0) { diff --git a/llama.cpp b/llama.cpp index aa6c89246..eedca802b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12141,27 +12141,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { - int nx = tensor->ne[0]; - if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { - new_type = GGML_TYPE_Q8_0; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = GGML_TYPE_Q5_K; - } - else if (new_type != GGML_TYPE_Q8_0) { - new_type = GGML_TYPE_Q6_K; + if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { + new_type = qs.params->output_tensor_type; + } else { + int nx = tensor->ne[0]; + if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + new_type = GGML_TYPE_Q5_K; + } + else if (new_type != GGML_TYPE_Q8_0) { + new_type = GGML_TYPE_Q6_K; + } } } else if (name == "token_embd.weight") { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { - new_type = GGML_TYPE_Q2_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || 
ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ3_S; + if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { + new_type = qs.params->token_embedding_type; + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + new_type = GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + new_type = GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = GGML_TYPE_IQ3_S; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { @@ -13051,6 +13058,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { struct llama_model_quantize_params result = { /*.nthread =*/ 0, /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, + /*.output_tensor_type =*/ GGML_TYPE_COUNT, + /*.token_embedding_type =*/ GGML_TYPE_COUNT, /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, diff --git a/llama.h b/llama.h index 7e8ac4b62..74f0e56de 100644 --- a/llama.h +++ b/llama.h @@ -275,13 +275,15 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - void * imatrix; // pointer to importance matrix data + int32_t nthread; // number of threads to use for quantizing, if <=0 will 
use std::thread::hardware_concurrency() + enum llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // itoken embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + void * imatrix; // pointer to importance matrix data } llama_model_quantize_params; // grammar types From 92397d87a45a09b5449d845a64856f177cd7a920 Mon Sep 17 00:00:00 2001 From: fraxy-v <65565042+fraxy-v@users.noreply.github.com> Date: Fri, 22 Mar 2024 20:49:06 +0200 Subject: [PATCH 04/44] convert-llama2c-to-ggml : enable conversion of GQA models (#6237) * convert-llama2c-to-ggml: enable conversion of multiqueries, #5608 * add test in build action * Update build.yml * Update build.yml * Update build.yml * gg patch --- .github/workflows/build.yml | 11 + examples/convert-llama2c-to-ggml/README.md | 2 + .../convert-llama2c-to-ggml.cpp | 387 ++++++++---------- 3 files changed, 193 insertions(+), 207 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7711bd8d8..bf42df8fe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -225,6 +225,17 @@ jobs: cd build ctest -L main --verbose --timeout 900 + - name: Test llama2c conversion + id: llama2c_test + run: | + cd build + echo "Fetch tokenizer" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin + echo "Fetch llama2c model" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin + ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 
-c 256 + # ubuntu-latest-cmake-sanitizer: # runs-on: ubuntu-latest # diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 0f37d295b..6da2d7e18 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -21,6 +21,8 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface. `$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` +Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). + Now you can use the model with a command like: `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8209dcb64..6b5c66530 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "llama.h" #include "common.h" +#include "log.h" #include #include @@ -78,111 +79,101 @@ typedef struct { struct TransformerWeights { // token embedding table - float* token_embedding_table; // (vocab_size, dim) + std::vector token_embedding_table; // (vocab_size, dim) // weights for rmsnorms - float* rms_att_weight; // (layer, dim) rmsnorm weights - float* rms_ffn_weight; // (layer, dim) + std::vector rms_att_weight; // (layer, dim) rmsnorm weights + std::vector rms_ffn_weight; // (layer, dim) // weights for matmuls - float* wq; // (layer, dim, dim) - float* wk; // (layer, dim, dim) - float* wv; // (layer, dim, dim) - float* wo; // (layer, dim, dim) + std::vector wq; // (layer, dim, dim) + std::vector wk; // (layer, dim, dim) + std::vector wv; // (layer, 
dim, dim) + std::vector wo; // (layer, dim, dim) // weights for ffn - float* w1; // (layer, hidden_dim, dim) - float* w2; // (layer, dim, hidden_dim) - float* w3; // (layer, hidden_dim, dim) + std::vector w1; // (layer, hidden_dim, dim) + std::vector w2; // (layer, dim, hidden_dim) + std::vector w3; // (layer, hidden_dim, dim) // final rmsnorm - float* rms_final_weight; // (dim,) + std::vector rms_final_weight; // (dim,) // freq_cis for RoPE relatively positional embeddings - // float* freq_cis_real; // (seq_len, dim/2) - // float* freq_cis_imag; // (seq_len, dim/2) + // std::vector freq_cis_real; // (seq_len, dim/2) + // std::vector freq_cis_imag; // (seq_len, dim/2) // (optional) classifier weights for the logits, on the last layer - float* wcls; - - ~TransformerWeights() { - delete[] token_embedding_table; - delete[] rms_att_weight; - delete[] rms_ffn_weight; - delete[] wq; - delete[] wk; - delete[] wv; - delete[] wo; - delete[] w1; - delete[] w2; - delete[] w3; - delete[] rms_final_weight; - delete[] wcls; - } + std::vector wcls; }; -static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { - // we calloc instead of malloc to keep valgrind happy - w->token_embedding_table = new float[p->vocab_size * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); +static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) { + const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 
1 : p->n_heads / p->n_kv_heads; + try { + w->token_embedding_table.resize(p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); - w->rms_att_weight = new float[p->n_layers * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + w->rms_att_weight.resize(p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); - w->rms_ffn_weight = new float[p->n_layers * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + w->rms_ffn_weight.resize(p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); - w->wq = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wq.resize(p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - w->wk = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - w->wv = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, 
p->dim, p->n_layers * p->dim * p->dim); + w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - w->wo = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wo.resize(p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + w->w1.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + w->w2.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); - w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + w->w3.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - 
w->rms_final_weight = new float[p->dim](); - printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + w->rms_final_weight.resize(p->dim); + LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); - if (shared_weights) { - w->wcls = NULL; - } else { - w->wcls = new float[p->vocab_size * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + if (shared_weights) { + w->wcls = {}; + } else { + w->wcls.resize(p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + } + } + catch (std::length_error &) { + die("Invalid configuration. Failed to allocate memory for weights"); } } -static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { - if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; - if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * 
p->dim, f) != static_cast(p->n_layers * p->hidden_dim * p->dim)) return 1; - if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast(p->dim)) return 1; +static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) { + if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1; + if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1; + if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1; + if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1; + if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1; + if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1; + if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1; + if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1; + if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1; + if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1; + if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1; // Skip freq_cis_real & freq_cis_imag int head_size = p->dim / p->n_heads; fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR); - if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; + if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1; // Check we didn't forget to read anything auto curr = ftell(f); fseek(f, 0, SEEK_END); auto end = 
ftell(f); if (curr != end) { - printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end); + LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -190,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo } static void print_sample_weights(TransformerWeights *w){ - printf("----- Quick print of first of the weight vales of all the variables\n"); - printf("%f\n", w->token_embedding_table[0]); - printf("%f\n", w->rms_att_weight[0]); - printf("%f\n", w->rms_ffn_weight[0]); + LOG("----- Quick print of first of the weight vales of all the variables\n"); + LOG("%f\n", w->token_embedding_table[0]); + LOG("%f\n", w->rms_att_weight[0]); + LOG("%f\n", w->rms_ffn_weight[0]); - printf("%f\n", w->wq[0]); - printf("%f\n", w->wk[0]); - printf("%f\n", w->wv[0]); - printf("%f\n", w->wo[0]); - printf("%f\n", w->w1[0]); - printf("%f\n", w->w2[0]); - printf("%f\n", w->w3[0]); - printf("%f\n", w->rms_att_weight[0]); - if (w->wcls) printf("%f\n", w->wcls[0]); + LOG("%f\n", w->wq[0]); + LOG("%f\n", w->wk[0]); + LOG("%f\n", w->wv[0]); + LOG("%f\n", w->wo[0]); + LOG("%f\n", w->w1[0]); + LOG("%f\n", w->w2[0]); + LOG("%f\n", w->w3[0]); + LOG("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -225,14 +216,16 @@ struct llama_vocab { }; struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? 
+ uint32_t n_embd = 4096; + uint32_t n_ff = 11008; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + bool operator!=(const my_llama_hparams& other) const { return memcmp(this, &other, sizeof(my_llama_hparams)); } @@ -325,14 +318,30 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); - printf("%s: n_embd: %u\n", __func__, params->n_embd); - printf("%s: n_mult: %u\n", __func__, params->n_mult); - printf("%s: n_head: %u\n", __func__, params->n_head); - printf("%s: n_ff: %u\n", __func__, params->n_ff); - printf("%s: n_layer: %u\n", __func__, params->n_layer); - printf("%s: n_rot: %u\n", __func__, params->n_rot); + LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG("%s: n_embd: %u\n", __func__, params->n_embd); + LOG("%s: n_mult: %u\n", __func__, params->n_mult); + LOG("%s: n_head: %u\n", __func__, params->n_head); + LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG("%s: n_ff: %u\n", __func__, params->n_ff); + LOG("%s: n_layer: %u\n", __func__, params->n_layer); + LOG("%s: n_rot: %u\n", __func__, params->n_rot); +} + +static void print_tensor_info(const struct ggml_context * ctx) { + for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + LOG("%s: Allocating ", __func__); + int64_t total = 1; + int i = 0; + for (; i < ggml_n_dims(t); ++i) { + if (i > 0) LOG("x "); + LOG("[%" PRId64 "] ", t->ne[i]); + total *= t->ne[i]; + } + if (i > 1) LOG("= [%" PRId64 "] ", total); + LOG("float space for %s\n", ggml_get_name(t)); + } } static void init_model(struct my_llama_model * model) { @@ -342,6 +351,8 @@ static void init_model(struct my_llama_model * model) { const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; + const 
uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv; + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; @@ -350,25 +361,8 @@ static void init_model(struct my_llama_model * model) { model->train_tokens = 0; model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); - - // printing the per-layer allocations here so we dont print in the for loop. - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - - printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer); - - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 
for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); ggml_set_name(model->norm, "norm.weight"); @@ -383,8 +377,8 @@ static void init_model(struct my_llama_model * model) { layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); @@ -406,6 +400,8 @@ static void init_model(struct my_llama_model * model) { ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); } + + print_tensor_info(ctx); } static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { @@ -421,9 +417,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); - printf(" %f", p); + LOG(" %f", p); } - printf("\n"); + LOG("\n"); } static void print_matrix(struct ggml_tensor * probs) { @@ -431,33 +427,12 @@ static void print_matrix(struct ggml_tensor * probs) { for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); + LOG(" %.2f", p); } - printf("\n"); + LOG("\n"); } } -#ifdef __GNUC__ -#ifdef __MINGW32__ -__attribute__((format(gnu_printf, 1, 2))) -#else -__attribute__((format(printf, 1, 2))) -#endif -#endif -static std::string format(const char * fmt, ...) 
{ - va_list ap, ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - struct llama_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; @@ -549,8 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) { return out.str(); } -static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { +static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { + LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -578,6 +554,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + if (n_vocab != static_cast(config->vocab_size)) { + die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size); + } vocab->id_to_token.resize(n_vocab); @@ -595,7 +574,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab gguf_free(ctx); } else { // assume llama2.c vocabulary - printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); + LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); @@ -638,38 +617,15 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab } static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { - int ct; - switch (ggml_n_dims(gg_weights)) 
{ - case 1: - ct = 0; - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]); - *ptr = karpathy_weights[ct]; - ct++; - } - break; - case 2: - ct = 0; - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - break; - case 3: - ct = 0; - for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) { - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - } - break; + int size = 1; + for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) { + size *= gg_weights->ne[dim]; + } + for (int ct = 0; ct < size; ++ct) { + int64_t i0 = 0; int64_t i1 = 0; + int64_t i2 = 0; int64_t i3 = 0; + ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3); + ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]); } } @@ -679,16 +635,18 @@ static void save_as_llama_model( // convert AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor - convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table); - convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); + convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data()); + convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? 
w->wcls.data() : w->token_embedding_table.data()); - convert_weights_ak_to_gg(model->norm, w->rms_final_weight); + convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data()); //print_row(model->norm, 0); // for rms-att-weight int row_length = model->hparams.n_embd; int n_ff = model->hparams.n_ff; + const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv; + for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; // 1d @@ -697,9 +655,10 @@ static void save_as_llama_model( // from 3d matrix layer x dim x dim to 2d matrix dim x dim convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]); - convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]); - convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]); convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]); + // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries + convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]); + convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]); convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]); convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]); @@ -736,8 +695,8 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); - // n_head_kv is optional, default to n_head - // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv); gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer); gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, 
model->hparams.n_rot); gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f); @@ -789,12 +748,12 @@ static void save_as_llama_model( static struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; + params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_llama2c_output_model = "ak_llama_model.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; params.seed = -1; @@ -829,8 +788,8 @@ static struct train_params get_default_train_params() { params.adam_alpha = 1e-3f; params.adam_decay = 1e-3f; - params.mem_model_gb = 2; - params.mem_compute_gb = 24; + params.mem_model_gb = 2; + params.mem_compute_gb = 24; params.mem_compute0_gb = 8; params.mem_compute1_gb = 2; @@ -916,19 +875,30 @@ int main(int argc, char ** argv) { if (!params_parse(argc, argv, ¶ms)) { return 1; } + log_set_target(stdout); Config config; TransformerWeights weights = {}; { - FILE *file = fopen(params.fn_llama2c_model, "rb"); - if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } + LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + FILE *file = fopen(params.fn_llama2c_model, "r"); + if (!file) { + LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + return 1; + } // read in the config header - if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } + if (fread(&config, sizeof(Config), 1, file) != 1) { + LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + return 1; + } auto shared_weights = config.vocab_size > 0; config.vocab_size 
= abs(config.vocab_size); // read in the Transformer weights - malloc_weights(&weights, &config, shared_weights); - if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; } + alloc_weights(&weights, &config, shared_weights); + if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { + LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + return 1; + } fclose(file); } @@ -936,15 +906,18 @@ int main(int argc, char ** argv) { load_vocab(params.fn_vocab_model, &config, &vocab); struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; - model.hparams.n_embd = config.dim; //params.n_embd; - model.hparams.n_ff = config.hidden_dim; - model.hparams.n_mult = 32;//params.n_mult; - model.hparams.n_head = config.n_heads; //params.n_head; - model.hparams.n_layer = config.n_layers; //params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_ff = config.hidden_dim; + model.hparams.n_mult = 32;//params.n_mult; + model.hparams.n_head = config.n_heads; //params.n_head; + model.hparams.n_head_kv = config.n_kv_heads; + model.hparams.n_layer = config.n_layers; //params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + print_params(&model.hparams); + struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; @@ -956,7 +929,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - printf("Saving llama.c model file %s in ggml format at %s\n", 
params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; From 56a00f0a2f48a85376f48b5ce77699df781631ae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Mar 2024 21:10:39 +0200 Subject: [PATCH 05/44] common : default --hf-file to --model (#6234) --- common/common.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 0cc4859f1..de6eb960a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1220,9 +1220,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { throw std::invalid_argument("error: unknown argument: " + arg); } } + if (invalid_param) { throw std::invalid_argument("error: invalid parameter for argument: " + arg); } + if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { @@ -1230,6 +1232,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } + // short-hand to avoid specifying --hf-file -> default it to --model + if (!params.hf_repo.empty() && params.hf_file.empty()) { + params.hf_file = params.model; + } + if (params.escape) { process_escapes(params.prompt); process_escapes(params.input_prefix); From 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 23 Mar 2024 01:24:36 +0100 Subject: [PATCH 06/44] lookup: complement data from context with general text statistics (#5479) * lookup: evaluation tools, use corpus/previous gens * fixup! lookup: evaluation tools, use corpus/previous gens * fixup! lookup: evaluation tools, use corpus/previous gens * fixup! lookup: evaluation tools, use corpus/previous gens * fixup! 
lookup: evaluation tools, use corpus/previous gens --- .gitignore | 3 + Makefile | 13 +- common/CMakeLists.txt | 2 + common/common.cpp | 20 +++ common/common.h | 28 +-- common/ngram-cache.cpp | 280 ++++++++++++++++++++++++++++++ common/ngram-cache.h | 94 ++++++++++ examples/lookup/CMakeLists.txt | 18 ++ examples/lookup/lookup-create.cpp | 43 +++++ examples/lookup/lookup-merge.cpp | 47 +++++ examples/lookup/lookup-stats.cpp | 163 +++++++++++++++++ examples/lookup/lookup.cpp | 116 ++++++++----- scripts/get-wikitext-103.sh | 10 ++ 13 files changed, 774 insertions(+), 63 deletions(-) create mode 100644 common/ngram-cache.cpp create mode 100644 common/ngram-cache.h create mode 100644 examples/lookup/lookup-create.cpp create mode 100644 examples/lookup/lookup-merge.cpp create mode 100644 examples/lookup/lookup-stats.cpp create mode 100755 scripts/get-wikitext-103.sh diff --git a/.gitignore b/.gitignore index 51aa84222..27562f6d7 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,9 @@ models-mnt /llava-cli /lookahead /lookup +/lookup-create +/lookup-merge +/lookup-stats /main /metal /passkey diff --git a/Makefile b/Makefile index fa112e708..b8b261aba 100644 --- a/Makefile +++ b/Makefile @@ -676,6 +676,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t train.o: common/train.cpp common/train.h $(CXX) $(CXXFLAGS) -c $< -o $@ +ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) @@ -683,7 +686,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) clean: - rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) 
find examples pocs -type f -name "*.o" -delete # @@ -813,9 +816,15 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS) passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 10951693a..1d840e5f7 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -65,6 +65,8 @@ add_library(${TARGET} STATIC json.hpp train.h train.cpp + ngram-cache.h + ngram-cache.cpp ) if (BUILD_SHARED_LIBS) diff --git a/common/common.cpp b/common/common.cpp index de6eb960a..69c2d5bf7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -963,6 +963,22 @@ static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, } return true; } + if (arg == 
"-lcs" || arg == "--lookup-cache-static") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lookup_cache_static = argv[i]; + return true; + } + if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lookup_cache_dynamic = argv[i]; + return true; + } if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { if (++i >= argc) { invalid_param = true; @@ -1436,6 +1452,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" Hugging Face model file (default: unused)\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); + printf(" -lcs FNAME, --lookup-cache-static FNAME\n"); + printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n"); + printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n"); + printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); diff --git a/common/common.h b/common/common.h index d827d4df7..afa4cf6d7 100644 --- a/common/common.h +++ b/common/common.h @@ -88,20 +88,22 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = "models/7B/ggml-model-f16.gguf"; // model path - std::string model_draft = ""; // draft model for speculative decoding - std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file - std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with + std::string model = "models/7B/ggml-model-f16.gguf"; // model path + std::string model_draft = ""; // draft model for speculative decoding + std::string model_alias = "unknown"; // model alias + std::string model_url = ""; // model url to download + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file + std::string prompt = ""; + std::string prompt_file = ""; // store the external prompt file name + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with std::vector antiprompt; // string upon seeing which more user input is prompted - std::string logdir = ""; // directory in which to save YAML log files - std::string logits_file = ""; // file for saving *all* logits + std::string logdir = ""; // directory in which to save YAML log files + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding + 
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding + std::string logits_file = ""; // file for saving *all* logits std::vector kv_overrides; diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp new file mode 100644 index 000000000..20703d306 --- /dev/null +++ b/common/ngram-cache.cpp @@ -0,0 +1,280 @@ +#include "ngram-cache.h" +#include "log.h" + +#include + +void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, + std::vector & inp, int nnew, bool print_progress) { + const int64_t t_start_ms = ggml_time_ms(); + const int64_t inp_size = inp.size(); + + const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1); + int64_t n_done = 0; + + for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) { + const int64_t i_start = std::max(inp_size - nnew, ngram_size); + for (int64_t i = i_start; i < inp_size; ++i) { + const int64_t ngram_start = i - ngram_size; + llama_ngram ngram(&inp[ngram_start], ngram_size); + const llama_token token = inp[i]; + + llama_ngram_cache::iterator part_it = ngram_cache.find(ngram); + if (part_it == ngram_cache.end()) { + llama_ngram_cache_part part; + part.emplace(token, 1); + ngram_cache.emplace(ngram, part); + } else { + llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token); + if (token_count_it == part_it->second.end()) { + part_it->second.emplace(token, 1); + } else { + token_count_it->second++; + } + } + ++n_done; + + if (print_progress && n_done % 10000000 == 0) { + const int64_t t_now_ms = ggml_time_ms(); + const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done; + const int64_t eta_min = eta_ms / (60*1000); + const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; + + fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s); + } + } + } +} + +// Helper function to get a token from the 
combined, speculative sequence of inp and draft. +static llama_token get_token(const std::vector & inp, const std::vector & draft, const size_t i) { + return i < inp.size() ? inp[i] : draft[1 + i - inp.size()]; +} + +// If sample size or percentage are below these thresholds the draft is aborted early: +constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1}; +constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50}; +constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; +constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; + +// Helper function that tries to draft a token from only the static ngram cache: +static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) { + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + if (part_static_it == nc_static.end()) { + return -1; + } + const llama_ngram_cache_part part_static = part_static_it->second; + + int max_count_static = 0; + int sum_count_static = 0; + llama_token max_token = -1; + + for (std::pair token_count_static : part_static) { + const llama_token token = token_count_static.first; + const int32_t count_static = token_count_static.second; + + if (count_static > max_count_static) { + max_token = token; + max_count_static = count_static; + } + sum_count_static += count_static; + } + + if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { + return -1; + } + if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { + return -1; + } + return max_token; +} + +// Try to draft a token from primary cache (context/dynamic), validate with static cache: +static llama_token try_draft( + llama_ngram_cache & nc_primary, const std::vector & ngrams_primary, llama_ngram_cache_part & part_static, + const int * min_sample_size, const int * min_percent) { + + llama_token drafted_token = -1; + + for (int i = ngrams_primary.size()-1; i >= 0 && 
drafted_token == -1; --i) { + const llama_ngram ngram_primary = ngrams_primary[i]; + + llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); + if (part_primary_it == nc_primary.end()) { + continue; + } + const llama_ngram_cache_part part_primary = part_primary_it->second; + + int max_count_primary = 0; + int max_count_static = 0; + int sum_count_primary = 0; + llama_token max_token = -1; + + for (std::pair token_count_primary : part_primary) { + const llama_token token = token_count_primary.first; + + llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token); + + const int32_t count_primary = token_count_primary.second; + const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1; + + if (count_primary*count_static > max_count_primary*max_count_static) { + max_token = token; + max_count_primary = count_primary; + max_count_static = count_static; + } + sum_count_primary += count_primary; + } + + if (sum_count_primary < min_sample_size[i]) { + continue; + } + if (100*max_count_primary < min_percent[i]*sum_count_primary) { + continue;; + } + drafted_token = max_token; + } + + return drafted_token; +} + +void llama_ngram_cache_draft( + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static +) { + GGML_ASSERT(draft.size() == 1); + const int inp_size = inp.size(); + + if (inp_size < LLAMA_NGRAM_STATIC) { + return; + } + + while ((int) draft.size()-1 < n_draft) { + llama_token drafted_token = -1; + + const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; + llama_ngram ngram_static; + for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { + ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); + } + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + 
llama_ngram_cache_part part_static; + if (part_static_it != nc_static.end()) { + part_static = part_static_it->second; + } + + // cd = context + dynamic + std::vector ngrams_cd; + for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) { + const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1; + llama_ngram ngram_cd; + for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) { + ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j); + } + ngrams_cd.push_back(ngram_cd); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_static, ngram_static); + } + + if (drafted_token == -1) { + break; + } + + LOG(" - draft candidate: token=%d\n", drafted_token); + draft.push_back(drafted_token); + } +} + +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) { + std::ofstream file_out(filename, std::ios::binary); + for (std::pair item : ngram_cache) { + const llama_ngram ngram = item.first; + llama_ngram_cache_part token_counts = item.second; + GGML_ASSERT(!token_counts.empty()); + const int32_t ntokens = token_counts.size(); + GGML_ASSERT(ntokens > 0); + + file_out.write(reinterpret_cast(&ngram), sizeof(llama_ngram)); + file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); + for (std::pair item2 : token_counts) { + const llama_token token = item2.first; + const int32_t count = item2.second; + GGML_ASSERT(count > 0); + + file_out.write(reinterpret_cast(&token), sizeof(llama_token)); + file_out.write(reinterpret_cast(&count), sizeof(int32_t)); + } + } + +} + +llama_ngram_cache llama_ngram_cache_load(std::string & filename) { + std::ifstream hashmap_file(filename, 
std::ios::binary); + if (!hashmap_file) { + throw std::ifstream::failure("Unable to open file " + filename); + } + llama_ngram_cache ngram_cache; + + llama_ngram ngram; + int32_t ntokens; + llama_token token; + int32_t count; + + char * ngramc = reinterpret_cast(&ngram); + char * ntokensc = reinterpret_cast(&ntokens); + char * tokenc = reinterpret_cast(&token); + char * countc = reinterpret_cast(&count); + while(hashmap_file.read(ngramc, sizeof(llama_ngram))) { + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t))); + GGML_ASSERT(ntokens > 0); + llama_ngram_cache_part token_counts; + + for (int i = 0; i < ntokens; ++i) { + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token))); + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t))); + GGML_ASSERT(count > 0); + token_counts.emplace(token, count); + } + + ngram_cache.emplace(ngram, token_counts); + } + GGML_ASSERT(hashmap_file.eof()); + + return ngram_cache; +} + +void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) { + for (std::pair ngram_part : ngram_cache_add) { + const llama_ngram ngram = ngram_part.first; + llama_ngram_cache_part part = ngram_part.second; + + llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); + if (part_merged_it == ngram_cache_target.end()) { + ngram_cache_target.emplace(ngram, part); + continue; + } + + for (std::pair token_count : part) { + const llama_token token = token_count.first; + const int32_t count = token_count.second; + GGML_ASSERT(count > 0); + + llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); + if (token_count_merged_it == part_merged_it->second.end()) { + part_merged_it->second.emplace(token, count); + continue; + } + + token_count_merged_it->second += count; + } + } +} diff --git a/common/ngram-cache.h b/common/ngram-cache.h new file 
mode 100644 index 000000000..e4fa4cbd1 --- /dev/null +++ b/common/ngram-cache.h @@ -0,0 +1,94 @@ +#pragma once + +#include "llama.h" + +#include +#include +#include + +#define LLAMA_NGRAM_MIN 1 +#define LLAMA_NGRAM_MAX 4 +#define LLAMA_NGRAM_STATIC 2 + +// Data structures to map n-grams to empirical token probabilities: + +struct llama_ngram { + llama_token tokens[LLAMA_NGRAM_MAX]; + + llama_ngram() { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + tokens[i] = -1; + } + } + + llama_ngram(const llama_token * input, const int ngram_size) { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + tokens[i] = i < ngram_size ? input[i] : -1; + } + } + + bool operator==(const llama_ngram & other) const { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + if (tokens[i] != other.tokens[i]) { + return false; + } + } + return true; + } +}; + +struct llama_ngram_hash_function { + size_t operator()(const llama_ngram & ngram) const { + size_t hash = 0; + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + hash ^= std::hash{}(ngram.tokens[i]); + } + return hash; + } +}; + +// token -> number of times token has been seen +typedef std::unordered_map llama_ngram_cache_part; + +// n-gram -> empirical distribution of following tokens +typedef std::unordered_map llama_ngram_cache; + + +// Update an ngram cache with tokens. +// ngram_cache: the cache to modify. +// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data. +// inp_data: the token sequence with which to update ngram_cache. +// nnew: how many new tokens have been appended to inp_data since the last call to this function. +// print_progress: whether to print progress to stderr. +// +// In order to get correct results inp_data can ONLY BE APPENDED TO. +// Changes in the middle need a complete rebuild. +void llama_ngram_cache_update( + llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); + +// Try to draft tokens from ngram caches. 
+// inp: the tokens generated so far. +// draft: the token sequence to draft. Expected to initially contain the previously sampled token. +// n_draft: maximum number of tokens to add to draft. +// ngram_min/ngram_max: the min/max size of the ngrams in nc_context and nc_dynamic. +// nc_context: ngram cache based on current context. +// nc_dynamic: ngram cache based on previous user generations. +// nc_static: ngram cache generated from a large text corpus, used for validation and as a fallback draft source. +void llama_ngram_cache_draft( + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); + +// Save an ngram cache to a file. +// ngram_cache: the ngram cache to save. +// filename: the path under which to save the ngram cache. +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); + +// Load an ngram cache saved with llama_ngram_cache_save. +// filename: the path from which to load the ngram cache. +// returns: an ngram cache containing the information saved to filename. +llama_ngram_cache llama_ngram_cache_load(std::string & filename); + +// Merge two ngram caches. +// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. +// ngram_cache_add: the ngram cache to add to ngram_cache_target. 
+void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add); diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index c060b8f56..b91633f63 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -3,3 +3,21 @@ add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET lookup-create) +add_executable(${TARGET} lookup-create.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET lookup-merge) +add_executable(${TARGET} lookup-merge.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET lookup-stats) +add_executable(${TARGET} lookup-stats.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp new file mode 100644 index 000000000..46a6bed07 --- /dev/null +++ b/examples/lookup/lookup-create.cpp @@ -0,0 +1,43 @@ +#include "ggml.h" +#include "llama.h" +#include "common.h" +#include "ngram-cache.h" + +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv){ + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + // init llama.cpp + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model = NULL; + llama_context * ctx = NULL; + + // load the model + std::tie(model, ctx) = llama_init_from_gpt_params(params); + GGML_ASSERT(model != nullptr); + + // tokenize the 
prompt + const bool add_bos = llama_should_add_bos_token(model); + + std::vector inp; + inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + fprintf(stderr, "%s: tokenization done\n", __func__); + + + llama_ngram_cache ngram_cache; + llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); + fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); + + llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); +} diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp new file mode 100644 index 000000000..07c93eb8d --- /dev/null +++ b/examples/lookup/lookup-merge.cpp @@ -0,0 +1,47 @@ +#include "ggml.h" +#include "llama.h" +#include "common.h" +#include "ngram-cache.h" + +#include +#include +#include +#include +#include +#include +#include + +static void print_usage() { + fprintf(stderr, "Merges multiple lookup cache files into a single one.\n"); + fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... 
lookup_merged.bin\n"); +} + +int main(int argc, char ** argv){ + if (argc < 3) { + print_usage(); + exit(1); + } + + std::vector args; + args.resize(argc-1); + for (int i = 0; i < argc-1; ++i) { + args[i] = argv[i+1]; + if (args[i] == "-h" || args[i] == "--help") { + print_usage(); + exit(0); + } + } + + fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str()); + llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]); + + for (size_t i = 1; i < args.size()-1; ++i) { + fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str()); + llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]); + + llama_ngram_cache_merge(ngram_cache_merged, ngram_cache); + } + + fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str()); + llama_ngram_cache_save(ngram_cache_merged, args.back()); +} diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp new file mode 100644 index 000000000..31f227773 --- /dev/null +++ b/examples/lookup/lookup-stats.cpp @@ -0,0 +1,163 @@ +#include "ggml.h" +#include "common.h" +#include "llama.h" +#include "log.h" +#include "ngram-cache.h" + +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv){ + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + const int n_draft = params.n_draft; + + // init llama.cpp + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model = NULL; + llama_context * ctx = NULL; + + // load the model + std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_set_rng_seed(ctx, params.seed); + GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); + + // tokenize the prompt + const bool add_bos = llama_should_add_bos_token(model); + LOG("add_bos tgt: %d\n", add_bos); + + std::vector inp; + inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + + llama_ngram_cache ngram_cache_context; + llama_ngram_cache ngram_cache_dynamic; + llama_ngram_cache 
ngram_cache_static; + int64_t t_draft_flat_us = 0; + int64_t t_draft_us = 0; + + { + const int64_t t_start_draft_us = ggml_time_us(); + + if (!params.lookup_cache_static.empty()) { + try { + ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + } catch (std::ifstream::failure const &) { + fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + exit(1); + } + } + + if (!params.lookup_cache_dynamic.empty()) { + try { + ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program + } + + t_draft_flat_us += ggml_time_us() - t_start_draft_us; + } + + const int n_input = inp.size(); + const int n_ctx = params.n_ctx; + + int n_drafted = 0; + int n_accept = 0; + + const int64_t t_start_ms = ggml_time_ms(); + + // Iterate over input tokens in chunks of size n_ctx. + // Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility. 
+ for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) { + const std::vector inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx); + std::vector pseudo_output; + pseudo_output.push_back(inp_slice[0]); + + while ((int) pseudo_output.size() < n_ctx) { + // Simulate drafting and decoding from draft: + std::vector draft; + draft.push_back(pseudo_output.back()); + + { + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + t_draft_us += ggml_time_us() - t_start_draft_us; + } + + n_drafted += draft.size() - 1; + + for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) { + const llama_token ground_truth = inp_slice[pseudo_output.size()]; + const llama_token drafted = draft[j]; + + if (ground_truth != drafted) { + break; + } + + ++n_accept; + pseudo_output.push_back(ground_truth); + + { + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } + } + + // After each simulated batch decoding simulate the sampling of a single token: + if ((int) pseudo_output.size() < n_ctx) { + pseudo_output.push_back(inp_slice[pseudo_output.size()]); + { + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } + } + + draft.erase(draft.begin()); + + } + if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) { + const int64_t t_now_ms = ggml_time_ms(); + const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start; + const int64_t eta_min = eta_ms / (60*1000); + const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; + + LOG_TEE("lookup-stats: %d/%d done, 
ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); + } + + // After each chunk, update the dynamic ngram cache with the context ngram cache: + llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + ngram_cache_context.clear(); + } + + LOG_TEE("\n"); + + LOG_TEE("\n"); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); + LOG_TEE("n_accept = %d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + fprintf(stderr, "\n\n"); + + return 0; +} diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index b53fae110..2e8c35de3 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,12 +1,15 @@ -#include "common.h" #include "ggml.h" #include "llama.h" +#include "common.h" +#include "ngram-cache.h" #include #include #include +#include #include #include +#include int main(int argc, char ** argv){ gpt_params params; @@ -15,11 +18,7 @@ int main(int argc, char ** argv){ return 1; } - // max/min n-grams size to search for in prompt - const int ngram_max = 4; - const int ngram_min = 1; - - // length of the candidate / draft sequence, if match is found + // max. 
number of additional tokens to draft if match is found const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache; @@ -39,6 +38,8 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_set_rng_seed(ctx, params.seed); + GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt const bool add_bos = llama_should_add_bos_token(model); @@ -47,6 +48,35 @@ int main(int argc, char ** argv){ std::vector inp; inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + llama_ngram_cache ngram_cache_context; + llama_ngram_cache ngram_cache_dynamic; + llama_ngram_cache ngram_cache_static; + int64_t t_draft_flat_us = 0; + int64_t t_draft_us = 0; + + { + // Fill up context ngram cache with tokens from user input: + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); + + if (!params.lookup_cache_static.empty()) { + try { + ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + } catch (std::ifstream::failure const &) { + fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + exit(1); + } + } + + if (!params.lookup_cache_dynamic.empty()) { + try { + ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program + } + + t_draft_flat_us += ggml_time_us() - t_start_draft_us; + } + const int max_context_size = llama_n_ctx(ctx); const int max_tokens_list_size = max_context_size - 4; @@ -76,8 +106,6 @@ int main(int argc, char ** argv){ int n_drafted = 0; int n_accept = 0; - int64_t t_draft_us = 0; - int n_past = inp.size(); bool has_eos = false; @@ -129,6 +157,12 @@ int main(int argc, char ** argv){ ++n_past; ++i_dft; inp.push_back(id); + { + // Update context ngram cache with 
the newly accepted token: + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } if (params.use_color) { // color accepted draft token @@ -149,6 +183,12 @@ int main(int argc, char ** argv){ draft.clear(); draft.push_back(id); inp.push_back(id); + { + // Update context ngram cache with the newly accepted token: + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } break; } @@ -163,44 +203,19 @@ int main(int argc, char ** argv){ llama_batch_clear(batch_tgt); llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); - // generate n_pred tokens through prompt lookup - auto prompt_lookup = [&]() -> void { - const int inp_size = inp.size(); - for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){ - const llama_token * ngram = &inp[inp_size - ngram_size]; - - for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) { - bool match = true; - for (int j = 0; j < ngram_size; ++j) { - if (inp[i + j] != ngram[j]) { - match = false; - break; - } - } - - if (match) { - const int startIdx = i + ngram_size; - const int endIdx = startIdx + n_draft; - if (endIdx < inp_size) { - for (int j = startIdx; j < endIdx; ++j) { - LOG(" - draft candidate %d: %d\n", j, inp[j]); - draft.push_back(inp[j]); - llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true); - ++n_drafted; - } - return; - } - } - } - } - return; - }; - + // Draft already contains a single token sampled from the model: + GGML_ASSERT(draft.size() == 1); + GGML_ASSERT(draft[0] == inp.back()); const int64_t t_start_draft_us = ggml_time_us(); - prompt_lookup(); + llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, 
ngram_cache_static); + + for (size_t i = 1; i < draft.size(); ++i) { + llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + } t_draft_us += ggml_time_us() - t_start_draft_us; + n_drafted += draft.size() - 1; llama_decode(ctx, batch_tgt); ++n_past; @@ -210,19 +225,24 @@ int main(int argc, char ** argv){ auto t_dec_end = ggml_time_us(); + // Update dynamic ngram cache with context ngram cache and save it to disk: + llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); + LOG_TEE("\n\n"); LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_TEE("n_accept = %d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("\ntarget:\n"); llama_print_timings(ctx); diff --git a/scripts/get-wikitext-103.sh b/scripts/get-wikitext-103.sh new file mode 100755 index 000000000..880dd5cbe --- /dev/null +++ b/scripts/get-wikitext-103.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip + 
+echo "Usage:" +echo "" +echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]" +echo "" + +exit 0 From 1b26aebe4de4f048ac99996efd8a2c9af150904d Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 23 Mar 2024 13:18:45 +0100 Subject: [PATCH 07/44] server: flush stdout after logging in both text and json layout (#6253) --- examples/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index d7eef556a..8f20ff614 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -95,8 +95,8 @@ static inline void server_log(const char *level, const char *function, int line, const std::string str = ss.str(); printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); } + fflush(stdout); } // From 21cad01b6e6e1a96f99391f95e8ea8ae25c8288e Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 23 Mar 2024 17:18:13 +0100 Subject: [PATCH 08/44] split: add gguf-split in the make build target (#6262) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b8b261aba..4f260cc3d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests From 476b0251b27fb64c575507024a671e639d675594 Mon Sep 17 00:00:00 2001 From: Julius Arkenberg Date: Sat, 23 Mar 2024 17:41:53 +0100 Subject: [PATCH 09/44] llama : add grok-1 support (#6204) * 
Add support for Grok model architecture * Revert convert-hf-to-gguf to default options * Fixed f_norm_rms_eps bug * Fix whitespaces * llama : fix grok rope type * llama : minor --------- Co-authored-by: Georgi Gerganov --- convert-hf-to-gguf.py | 26 +++ gguf-py/gguf/constants.py | 24 +++ gguf-py/gguf/tensor_mapping.py | 55 +++--- llama.cpp | 299 +++++++++++++++++++++++++++++++++ 4 files changed, 384 insertions(+), 20 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1e49d56c1..723ea18e3 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -93,31 +93,42 @@ class Model(ABC): if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) + print(f"gguf: context length = {n_ctx}") n_embd = self.find_hparam(["hidden_size", "n_embd"]) self.gguf_writer.add_embedding_length(n_embd) + print(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) + print(f"gguf: feed forward length = {n_ff}") n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_head_count(n_head) + print(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) + print(f"gguf: key-value head count = {n_head_kv}") if (rope_theta := self.hparams.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) + print(f"gguf: rope theta = {rope_theta}") if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + print(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) + print(f"gguf: layer norm epsilon = {f_norm_eps}") if 
(n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) + print(f"gguf: expert count = {n_experts}") if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) + print(f"gguf: experts used count = {n_experts_used}") self.gguf_writer.add_file_type(self.ftype) + print(f"gguf: file type = {self.ftype}") def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) @@ -1051,6 +1062,21 @@ class MixtralModel(Model): self._set_vocab_sentencepiece() +@Model.register("GrokForCausalLM") +class GrokModel(Model): + model_arch = gguf.MODEL_ARCH.GROK + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_name("Grok") + + @Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4a4facb06..e47896e2a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -100,6 +100,7 @@ class MODEL_ARCH(IntEnum): LLAMA = auto() FALCON = auto() BAICHUAN = auto() + GROK = auto() GPT2 = auto() GPTJ = auto() GPTNEOX = auto() @@ -167,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTNEOX: "gptneox", @@ -251,6 +253,28 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.GROK: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + 
MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.GPTNEOX: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ed89955d8..11fd34b8b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -23,6 +23,7 @@ class TensorNameMap: "model.embedding", # mamba-qbert "backbone.embedding", # mamba "backbone.embeddings", # mamba-hf + "transformer.in_out_embed", # Grok ), # Token type embeddings @@ -66,6 +67,7 @@ class TensorNameMap: "lm_head.ln", # phi2 "model.norm_f", # mamba-qbert "backbone.norm_f", # mamba + "transformer.rms_norm", # Grok ), # Rope frequencies @@ -93,6 +95,7 @@ class TensorNameMap: "model.layers.{bid}.attention_norm", # internlm2 "model.layers.{bid}.norm", # mamba-qbert "backbone.layers.{bid}.norm", # mamba + "transformer.decoder_layer.{bid}.rms_norm", # Grok ), # Attention norm 2 @@ -116,32 +119,35 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf - "layers.{bid}.attention.wq", # llama-pth - "encoder.layer.{bid}.attention.self.query", # bert - "transformer.h.{bid}.attn.q_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.q_proj", # plamo - "model.layers.{bid}.attention.wq" # internlm2 + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok 
), # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf - "layers.{bid}.attention.wk", # llama-pth - "encoder.layer.{bid}.attention.self.key", # bert - "transformer.h.{bid}.attn.k_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.k_proj", # plamo - "model.layers.{bid}.attention.wk" # internlm2 + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok ), # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf - "layers.{bid}.attention.wv", # llama-pth - "encoder.layer.{bid}.attention.self.value", # bert - "transformer.h.{bid}.attn.v_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.v_proj", # plamo - "model.layers.{bid}.attention.wv" # internlm2 + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok ), # Attention output @@ -162,12 +168,14 @@ class TensorNameMap: "model.layers.layers.{bid}.self_attn.o_proj", # plamo "model.layers.{bid}.attention.wo", # internlm2 "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layers.{bid}.norm1", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_1", # Grok ), # Rotary embeddings @@ -190,11 +198,13 @@ class TensorNameMap: 
"model.layers.{bid}.ln2", # yi "h.{bid}.ln_2", # gpt2 "model.layers.{bid}.ffn_norm", # internlm2 + "transformer.decoder_layer.{bid}.rms_norm_2", # Grok ), MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "transformer.decoder_layer.{bid}.router" # Grok ), # Feed-forward up @@ -223,6 +233,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral + "transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok ), # AWQ-activation gate @@ -243,6 +254,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral + "transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok ), # Feed-forward down @@ -270,6 +282,8 @@ class TensorNameMap: MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral + "transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok + ), MODEL_TENSOR.ATTN_Q_NORM: ( @@ -287,8 +301,9 @@ class TensorNameMap: ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok ), MODEL_TENSOR.SSM_IN: ( diff --git a/llama.cpp b/llama.cpp index eedca802b..4e08be18d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -195,6 +195,7 @@ enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, + LLM_ARCH_GROK, LLM_ARCH_GPT2, LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, @@ -224,6 +225,7 @@ enum llm_arch { static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GROK, "grok" }, { LLM_ARCH_GPT2, "gpt2" }, { 
LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTNEOX, "gptneox" }, @@ -494,6 +496,28 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_GROK, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + }, + }, { LLM_ARCH_GPT2, { @@ -1635,6 +1659,7 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, + MODEL_314B, MODEL_SMALL, MODEL_MEDIUM, MODEL_LARGE, @@ -3419,6 +3444,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; + case MODEL_314B: return "314B"; case MODEL_SMALL: return "0.1B"; case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; @@ -3557,6 +3583,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GROK: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 64: model.type = e_model::MODEL_314B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_FALCON: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -4394,6 +4429,54 @@ static bool llm_load_tensors( } } } break; + case LLM_ARCH_GROK: + { 
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}); + + GGML_ASSERT(hparams.n_expert > 0); + GGML_ASSERT(hparams.n_expert_used > 0); + + // MoE branch + for (uint32_t x = 0; x < hparams.n_expert; ++x) { + layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}); + layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}); + 
layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}); + } + + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + } + } break; case LLM_ARCH_BAICHUAN: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -5621,6 +5704,20 @@ static struct ggml_tensor * llm_build_kqv( ggml_mul_mat_set_prec(kq, GGML_PREC_F32); } + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + //try from phi2 + //ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx, kq, 30); + } + #if defined(GGML_USE_KOMPUTE) #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute") #pragma message(" Falling back to ggml_alibi(). 
Will become an error in Mar 2024") @@ -6395,6 +6492,203 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_grok() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // multiply by embedding_multiplier_scale of 78.38367176906169 + inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", 
il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + // Grok + // if attn_out_norm is present then apply it before adding the input + if (model.layers[il].attn_out_norm) { + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_out_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_out_norm", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] + cb(probs, "ffn_moe_probs", il); + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); + cb(weights, "ffn_moe_weights", il); + + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] + cb(weights, "ffn_moe_weights_norm", il); + + // compute expert outputs + ggml_tensor * moe_out = nullptr; + + 
for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert; + + ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur); + cb(cur_up, "ffn_moe_up", il); + + ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur); + cb(cur_gate, "ffn_moe_gate", il); + + //GeLU + cur_gate = ggml_gelu(ctx0, cur_gate); + cb(cur_gate, "ffn_moe_gelu", il); + + cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd] + cb(cur_expert, "ffn_moe_gate_par", il); + + cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cb(cur_expert, "ffn_moe_down", il); + + cur_expert = ggml_mul(ctx0, cur_expert, + ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); + cb(cur_expert, "ffn_moe_weighted", il); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + cb(moe_out, "ffn_moe_out", il); + } + } + + cur = moe_out; + + // Grok + // if layer_out_norm is present then apply it before adding the input + // Idea: maybe ffn_out_norm is a better name + if (model.layers[il].layer_out_norm) { + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].layer_out_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "layer_out_norm", il); + } + + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + // Grok + // multiply logits by output_multiplier_scale of 0.5773502691896257 + + cur = ggml_scale(ctx0, cur, 
0.5773502691896257f); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_starcoder() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -8818,6 +9112,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_falcon(); } break; + case LLM_ARCH_GROK: + { + result = llm.build_grok(); + } break; case LLM_ARCH_STARCODER: { result = llm.build_starcoder(); @@ -13561,6 +13859,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: + case LLM_ARCH_GROK: case LLM_ARCH_PERSIMMON: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: From 1997577d5e121568ae39f538021733ccd4278c23 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 23 Mar 2024 18:00:38 +0100 Subject: [PATCH 10/44] server: docs: `--threads` and `--threads`, `--ubatch-size`, `--log-disable` (#6254) --- examples/server/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 355601ff4..8aa4eac9d 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -16,8 +16,8 @@ The project is under active development, and we are [looking for feedback and co **Command line options:** -- `--threads N`, `-t N`: Set the number of threads to use during generation. -- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. +- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching, this parameter is used only if one token is to be processed on CPU backend. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. 
If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU. - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). @@ -26,7 +26,8 @@ The project is under active development, and we are [looking for feedback and co - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. -- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`. +- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`. 
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. @@ -57,7 +58,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. - `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) -- `--log-disable`: Output logs to stdout only, default: enabled. +- `--log-disable`: Output logs to stdout only, not to `llama.log`. default: enabled. 
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) **If compiled with `LLAMA_SERVER_SSL=ON`** From f482bb2e4920e544651fb832f2e0bcb4d2ff69ab Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 23 Mar 2024 18:07:00 +0100 Subject: [PATCH 11/44] common: llama_load_model_from_url split support (#6192) * llama: llama_split_prefix fix strncpy does not include string termination common: llama_load_model_from_url: - fix header name case sensitive - support downloading additional split in parallel - hide password in url * common: EOL EOF * common: remove redundant LLAMA_CURL_MAX_PATH_LENGTH definition * common: change max url max length * common: minor comment * server: support HF URL options * llama: llama_model_loader fix log * common: use a constant for max url length * common: clean up curl if file cannot be loaded in gguf * server: tests: add split tests, and HF options params * common: move llama_download_hide_password_in_url inside llama_download_file as a lambda * server: tests: enable back Release test on PR * spacing Co-authored-by: Georgi Gerganov * spacing Co-authored-by: Georgi Gerganov * spacing Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- .github/workflows/server.yml | 1 - common/common.cpp | 205 ++++++++++++++---- common/common.h | 7 + examples/gguf-split/gguf-split.cpp | 4 - examples/server/README.md | 4 +- examples/server/server.cpp | 18 +- .../server/tests/features/parallel.feature | 3 +- examples/server/tests/features/server.feature | 4 +- examples/server/tests/features/steps/steps.py | 13 +- llama.cpp | 4 +- 10 files changed, 200 insertions(+), 63 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index b74dc5e21..f07d25536 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -35,7 +35,6 @@ jobs: include: - build_type: Release sanitizer: "" - disabled_on_pr: true fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is 
broken container: diff --git a/common/common.cpp b/common/common.cpp index 69c2d5bf7..fb80d4bf7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -39,6 +39,9 @@ #endif #if defined(LLAMA_USE_CURL) #include +#include +#include +#include #endif #if defined(_MSC_VER) @@ -61,7 +64,7 @@ #else #include #endif -#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX +#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 #define LLAMA_CURL_MAX_HEADER_LENGTH 256 #endif // LLAMA_USE_CURL @@ -1702,27 +1705,13 @@ void llama_batch_add( #ifdef LLAMA_USE_CURL -struct llama_model * llama_load_model_from_url( - const char * model_url, - const char * path_model, - const struct llama_model_params & params) { - // Basic validation of the model_url - if (!model_url || strlen(model_url) == 0) { - fprintf(stderr, "%s: invalid model_url\n", __func__); - return NULL; - } - - // Initialize libcurl globally - auto curl = curl_easy_init(); - - if (!curl) { - fprintf(stderr, "%s: error initializing libcurl\n", __func__); - return NULL; - } +static bool llama_download_file(CURL * curl, const char * url, const char * path) { + bool force_download = false; // Set the URL, allow to follow http redirection - curl_easy_setopt(curl, CURLOPT_URL, model_url); + curl_easy_setopt(curl, CURLOPT_URL, url); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + #if defined(_WIN32) // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of // operating system. Currently implemented under MS-Windows. 
@@ -1731,16 +1720,16 @@ struct llama_model * llama_load_model_from_url( // Check if the file already exists locally struct stat model_file_info; - auto file_exists = (stat(path_model, &model_file_info) == 0); + auto file_exists = (stat(path, &model_file_info) == 0); // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model); + char etag_path[PATH_MAX] = {0}; + snprintf(etag_path, sizeof(etag_path), "%s.etag", path); char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model); + char last_modified_path[PATH_MAX] = {0}; + snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path); if (file_exists) { auto * f_etag = fopen(etag_path, "r"); @@ -1748,7 +1737,7 @@ struct llama_model * llama_load_model_from_url( if (!fgets(etag, sizeof(etag), f_etag)) { fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); } else { - fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag); + fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag); } fclose(f_etag); } @@ -1758,7 +1747,7 @@ struct llama_model * llama_load_model_from_url( if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); } else { - fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path, + fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path, last_modified); } fclose(f_last_modified); @@ -1776,6 +1765,11 @@ struct llama_model * llama_load_model_from_url( auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> 
size_t { llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; + // Convert header field name to lowercase + for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) { + buffer[i] = tolower(buffer[i]); + } + const char * etag_prefix = "etag: "; if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF @@ -1798,7 +1792,7 @@ struct llama_model * llama_load_model_from_url( if (res != CURLE_OK) { curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return NULL; + return false; } long http_code = 0; @@ -1806,30 +1800,34 @@ struct llama_model * llama_load_model_from_url( if (http_code != 200) { // HEAD not supported, we don't know if the file has changed // force trigger downloading - file_exists = false; + force_download = true; fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } } // If the ETag or the Last-Modified headers are different: trigger a new download - if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { - char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model); + bool should_download = !file_exists + || force_download + || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0) + || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0); + if (should_download) { + char path_temporary[PATH_MAX] = {0}; + snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path); if (file_exists) { - fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model); - if (remove(path_model) != 0) { + fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path); + if 
(remove(path) != 0) { curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model); - return NULL; + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path); + return false; } } // Set the output file - auto * outfile = fopen(path_model_temporary, "wb"); + auto * outfile = fopen(path_temporary, "wb"); if (!outfile) { curl_easy_cleanup(curl); - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); - return NULL; + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path); + return false; } typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); @@ -1843,15 +1841,30 @@ struct llama_model * llama_load_model_from_url( // display download progress curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + // helper function to hide password in URL + auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + std::size_t protocol_pos = url.find("://"); + if (protocol_pos == std::string::npos) { + return url; // Malformed URL + } + + std::size_t at_pos = url.find('@', protocol_pos + 3); + if (at_pos == std::string::npos) { + return url; // No password in URL + } + + return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + }; + // start the download - fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - model_url, path_model, headers.etag, headers.last_modified); + fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified); auto res = curl_easy_perform(curl); if (res != CURLE_OK) { fclose(outfile); curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return NULL; + return false; } long http_code = 0; @@ -1860,7 +1873,7 @@ 
struct llama_model * llama_load_model_from_url( fclose(outfile); curl_easy_cleanup(curl); fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); - return NULL; + return false; } // Clean up @@ -1872,7 +1885,7 @@ struct llama_model * llama_load_model_from_url( if (etag_file) { fputs(headers.etag, etag_file); fclose(etag_file); - fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag); + fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag); } } @@ -1882,20 +1895,118 @@ struct llama_model * llama_load_model_from_url( if (last_modified_file) { fputs(headers.last_modified, last_modified_file); fclose(last_modified_file); - fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path, + fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path, headers.last_modified); } } - if (rename(path_model_temporary, path_model) != 0) { + if (rename(path_temporary, path) != 0) { curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model); - return NULL; + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path); + return false; } } + return true; +} + +struct llama_model * llama_load_model_from_url( + const char * model_url, + const char * path_model, + const struct llama_model_params & params) { + // Basic validation of the model_url + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); + return NULL; + } + + // Initialize libcurl + auto * curl = curl_easy_init(); + + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return NULL; + } + + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return NULL; + } + + if (!llama_download_file(curl, model_url, path_model)) { + return NULL; + } + + // check for additional GGUFs split to download + 
int n_split = 0; + { + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, + }; + auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); + curl_easy_cleanup(curl); + return NULL; + } + + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split >= 0) { + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + } + + gguf_free(ctx_gguf); + } + curl_easy_cleanup(curl); + if (n_split > 1) { + char split_prefix[PATH_MAX] = {0}; + char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + + // Verify the first split file format + // and extract split URL and PATH prefixes + { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model file name: %s" + " n_split=%d\n", __func__, path_model, n_split); + return NULL; + } + + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model url: %s" + " n_split=%d\n", __func__, model_url, n_split); + return NULL; + } + } + + // Prepare download in parallel + std::vector<std::future<bool>> futures_download; + for (int idx = 1; idx < n_split; idx++) { + futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); + + char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); + + auto * curl = curl_easy_init(); + bool res = llama_download_file(curl, split_url, split_path); + curl_easy_cleanup(curl); + + return res; + }, idx)); + } + + // Wait for all downloads to complete + for (auto & f : futures_download) { + if (!f.get()) { + return NULL; + } + } + } + return
llama_load_model_from_file(path_model, params); } diff --git a/common/common.h b/common/common.h index afa4cf6d7..a223eceaa 100644 --- a/common/common.h +++ b/common/common.h @@ -306,3 +306,10 @@ struct llama_control_vector_load_info { // Load control vectors, scale each by strength, and add them together. // On error, returns {-1, empty} llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); + +// +// Split utils +// +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index f703588e1..b1af59992 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -26,10 +26,6 @@ enum split_operation : uint8_t { SPLIT_OP_MERGE, }; -static const char * const LLM_KV_SPLIT_NO = "split.no"; -static const char * const LLM_KV_SPLIT_COUNT = "split.count"; -static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; - struct split_params { split_operation operation = SPLIT_OP_SPLIT; int n_split_tensors = 128; diff --git a/examples/server/README.md b/examples/server/README.md index 8aa4eac9d..dfea2b905 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -20,7 +20,9 @@ The project is under active development, and we are [looking for feedback and co - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU. - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). 
-- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused). +- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused). +- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 27bd2dd70..b02c2546e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2208,7 +2208,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: %s)\n", params.model_url.c_str()); + printf(" model download url (default: unused)\n"); + printf(" -hfr REPO, --hf-repo REPO\n"); + printf(" Hugging Face model repository (default: unused)\n"); + printf(" -hff FILE, --hf-file FILE\n"); + printf(" Hugging Face model file (default: unused)\n"); printf(" -a ALIAS, --alias ALIAS\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); @@ -2337,6 +2341,18 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.model_url = argv[i]; + } else if (arg == "-hfr" || arg == "--hf-repo") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.hf_repo = argv[i]; + } else if (arg == "-hff" || arg == "--hf-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.hf_file = argv[i]; } else if (arg == "-a" || arg == "--alias") { if (++i >= argc) { invalid_param = true; diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index a66fed626..6cd306a2b 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -4,7 +4,8 @@ Feature: Parallel Background: Server startup Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models + And a 
model file test-model-00001-of-00003.gguf And 42 as server seed And 128 as batch size And 256 KV cache size diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index a2e0e5b35..646a4e49d 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -4,8 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf - And a model file stories260K.gguf + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model file test-model.gguf And a model alias tinyllama-2 And 42 as server seed # KV Cache corresponds to the total amount of tokens diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 03f55f659..86c3339dc 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -16,7 +16,6 @@ import numpy as np import openai from behave import step from behave.api.async_step import async_run_until_complete -from huggingface_hub import hf_hub_download from prometheus_client import parser @@ -39,6 +38,8 @@ def step_server_config(context, server_fqdn, server_port): context.model_alias = None context.model_file = None + context.model_hf_repo = None + context.model_hf_file = None context.model_url = None context.n_batch = None context.n_ubatch = None @@ -68,9 +69,9 @@ def step_server_config(context, server_fqdn, server_port): @step('a model file {hf_file} from HF repo {hf_repo}') def step_download_hf_model(context, hf_file, hf_repo): - context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file) - if context.debug: - print(f"model file: {context.model_file}") + context.model_hf_repo = hf_repo + context.model_hf_file = hf_file + context.model_file = os.path.basename(hf_file) @step('a model 
file {model_file}') @@ -1079,6 +1080,10 @@ def start_server_background(context): server_args.extend(['--model', context.model_file]) if context.model_url: server_args.extend(['--model-url', context.model_url]) + if context.model_hf_repo: + server_args.extend(['--hf-repo', context.model_hf_repo]) + if context.model_hf_file: + server_args.extend(['--hf-file', context.model_hf_file]) if context.n_batch: server_args.extend(['--batch-size', context.n_batch]) if context.n_ubatch: diff --git a/llama.cpp b/llama.cpp index 4e08be18d..b385ae360 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2959,7 +2959,7 @@ struct llama_model_loader { } } - LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); + LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } n_kv = gguf_get_n_kv(meta); @@ -15140,7 +15140,7 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int // check if dest ends with postfix int size_prefix = str_split_path.size() - str_postfix.size(); if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { - snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path); + snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); return size_prefix; } From 95562175f83a49755ff6fd3bad09409417c8e6f9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 23 Mar 2024 21:35:23 +0200 Subject: [PATCH 12/44] gitignore : gguf-split --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 27562f6d7..072945180 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ models-mnt /embedding /gguf /gguf-llama-simple +/gguf-split /gritlm /imatrix /infill From 94d1b3b4119209efcdd08df0dceaecbd1fe7f85c Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sat, 23 Mar 2024 18:48:02 -0400 Subject: [PATCH 13/44] use _wfopen instead of fopen on Windows (#6248) also fix missing #defines before windows.h, and BPE 
LF token on MSVC --- ggml.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++----- ggml.h | 8 +++++-- llama.cpp | 4 ++-- 3 files changed, 69 insertions(+), 9 deletions(-) diff --git a/ggml.c b/ggml.c index 1d5854960..54365b7ae 100644 --- a/ggml.c +++ b/ggml.c @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml-quants.h" +#include "ggml.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include <malloc.h> // using malloc.h with MSC/MINGW @@ -43,6 +44,10 @@ #if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX + #define NOMINMAX +#endif #include <windows.h> typedef volatile LONG atomic_int; @@ -430,6 +435,57 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif +// +// cross-platform UTF-8 file paths +// + +#ifdef _WIN32 +static wchar_t * ggml_mbstowcs(const char * mbs) { + int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0); + if (!wlen) { + errno = EINVAL; + return NULL; + } + + wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t)); + wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen); + if (!wlen) { + GGML_FREE(wbuf); + errno = EINVAL; + return NULL; + } + + return wbuf; +} +#endif + +FILE * ggml_fopen(const char * fname, const char * mode) { +#ifdef _WIN32 + FILE * file = NULL; + + // convert fname (UTF-8) + wchar_t * wfname = ggml_mbstowcs(fname); + if (wfname) { + // convert mode (ANSI) + wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t)); + wchar_t * wmode_p = wmode; + do { + *wmode_p++ = (wchar_t)*mode; + } while (*mode++); + + // open file + file = _wfopen(wfname, wmode); + + GGML_FREE(wfname); + GGML_FREE(wmode); + } + + return file; +#else + return fopen(fname, mode); +#endif +} + // // cache line // @@ -18739,7 +18795,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // write binary data { - FILE * fout = fopen(fname, "wb"); + FILE * fout = ggml_fopen(fname, "wb"); if (!fout) { fprintf(stderr, "%s: failed to open %s\n", __func__, fname); @@ -18877,7 +18933,7 @@ struct ggml_cgraph *
ggml_graph_import(const char * fname, struct ggml_context * // read file into data { - FILE * fin = fopen(fname, "rb"); + FILE * fin = ggml_fopen(fname, "rb"); if (!fin) { fprintf(stderr, "%s: failed to open %s\n", __func__, fname); return result; @@ -19213,7 +19269,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { char color[16]; - FILE * fp = fopen(filename, "w"); + FILE * fp = ggml_fopen(filename, "w"); GGML_ASSERT(fp); fprintf(fp, "digraph G {\n"); @@ -20531,7 +20587,7 @@ struct gguf_context * gguf_init_empty(void) { } struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { - FILE * file = fopen(fname, "rb"); + FILE * file = ggml_fopen(fname, "rb"); if (!file) { return NULL; } @@ -21486,7 +21542,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { - FILE * file = fopen(fname, "wb"); + FILE * file = ggml_fopen(fname, "wb"); if (!file) { GGML_ASSERT(false && "failed to open file for writing"); } diff --git a/ggml.h b/ggml.h index c937d4a53..0a5af7205 100644 --- a/ggml.h +++ b/ggml.h @@ -214,9 +214,10 @@ # define GGML_ATTRIBUTE_FORMAT(...) 
__attribute__((format(printf, __VA_ARGS__))) #endif -#include -#include #include +#include +#include +#include #define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_VERSION 1 @@ -708,6 +709,9 @@ extern "C" { GGML_API void ggml_print_backtrace(void); + // accepts a UTF-8 path, even on Windows + GGML_API FILE * ggml_fopen(const char * fname, const char * mode); + GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node diff --git a/llama.cpp b/llama.cpp index b385ae360..9614cdb17 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1065,7 +1065,7 @@ struct llama_file { size_t size; llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); + fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } @@ -4006,7 +4006,7 @@ static void llm_load_vocab( } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { vocab.linefeed_id = vocab.special_pad_id; } else { - const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); + const std::vector ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); vocab.linefeed_id = ids[0]; } From d03224ac9840351023ff8abcf4aa0542258a53df Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Sun, 24 Mar 2024 09:44:01 +0800 Subject: [PATCH 14/44] Support build win release for SYCL (#6241) * support release win * fix value * fix value * fix value * fix error * fix error * fix format --- .github/workflows/build.yml | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bf42df8fe..0e7643bba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -800,6 +800,7 @@ jobs: windows-latest-cmake-sycl: runs-on: 
windows-latest + defaults: run: shell: bash @@ -808,7 +809,6 @@ jobs: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel - steps: - name: Clone id: checkout @@ -823,6 +823,32 @@ jobs: id: cmake_build run: examples/sycl/win-build-sycl.bat + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip .\build\bin\* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v3 + with: + path: | + llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + ios-xcode-build: runs-on: macos-latest From ddf65685105a39a57b1e7f80c3aa502a6313af24 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Sun, 24 Mar 2024 12:04:25 +0800 Subject: [PATCH 15/44] [SYCL] offload op (#6217) * remove no USM methods * leave the schedule to ggml_backend_sched entirely --- ggml-sycl.cpp | 293 ++++---------------------------------------------- ggml-sycl.h | 16 ++- ggml.c | 10 -- llama.cpp | 36 +++---- 4 files changed, 51 insertions(+), 304 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cc9ee0762..fc4d2964c 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -740,11 +740,7 @@ namespace dpct 
sycl::queue &default_queue() { -#ifdef DPCT_USM_LEVEL_NONE - return out_of_order_queue(); -#else return in_order_queue(); -#endif // DPCT_USM_LEVEL_NONE } void queues_wait_and_throw() @@ -763,11 +759,7 @@ namespace dpct sycl::queue *create_queue(bool enable_exception_handler = false) { -#ifdef DPCT_USM_LEVEL_NONE - return create_out_of_order_queue(enable_exception_handler); -#else return create_in_order_queue(enable_exception_handler); -#endif // DPCT_USM_LEVEL_NONE } sycl::queue *create_queue(sycl::context context, sycl::device device, @@ -1075,11 +1067,6 @@ namespace dpct static pointer_access_attribute get_pointer_attribute(sycl::queue &q, const void *ptr) { -#ifdef DPCT_USM_LEVEL_NONE - return mem_mgr::instance().is_device_ptr(ptr) - ? pointer_access_attribute::device_only - : pointer_access_attribute::host_only; -#else switch (sycl::get_pointer_type(ptr, q.get_context())) { case sycl::usm::alloc::unknown: @@ -1090,7 +1077,6 @@ namespace dpct case sycl::usm::alloc::host: return pointer_access_attribute::host_device; } -#endif } template @@ -1273,11 +1259,7 @@ namespace dpct static inline void *dpct_malloc(size_t size, sycl::queue &q) { -#ifdef DPCT_USM_LEVEL_NONE - return mem_mgr::instance().mem_alloc(size * sizeof(byte_t)); -#else return sycl::malloc_device(size, q.get_device(), q.get_context()); -#endif // DPCT_USM_LEVEL_NONE } #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) @@ -1301,25 +1283,7 @@ namespace dpct static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr, valueT value, size_t size) { -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - assert(mm.is_device_ptr(dev_ptr)); - auto alloc = mm.translate_ptr(dev_ptr); - size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr; - - return q.submit([&](sycl::handler &cgh) - { - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - auto new_buffer = alloc.buffer.reinterpret( - sycl::range<1>(alloc.size / sizeof(valueT))); - sycl::accessor - acc(new_buffer, cgh, 
r, o); - cgh.fill(acc, value); }); -#else return q.fill(dev_ptr, value, size); -#endif // DPCT_USM_LEVEL_NONE } /** @@ -1413,72 +1377,8 @@ namespace dpct { if (!size) return sycl::event{}; -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - - switch (real_direction) - { - case host_to_host: - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); }); - case host_to_device: - { - auto alloc = mm.translate_ptr(to_ptr); - size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(from_ptr, acc); }); - } - case device_to_host: - { - auto alloc = mm.translate_ptr(from_ptr); - size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(acc, to_ptr); }); - } - case device_to_device: - { - auto to_alloc = mm.translate_ptr(to_ptr); - auto from_alloc = mm.translate_ptr(from_ptr); - size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, r, to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, r, from_o); - cgh.copy(from_acc, to_acc); }); - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } -#else return q.memcpy(to_ptr, from_ptr, size, dep_events); GGML_UNUSED(direction); -#endif // 
DPCT_USM_LEVEL_NONE } // Get actual copy range and make sure it will not exceed range. @@ -1618,45 +1518,15 @@ namespace dpct break; } case device_to_device: -#ifdef DPCT_USM_LEVEL_NONE - { - auto &mm = mem_mgr::instance(); - auto to_alloc = mm.translate_ptr(to_surface); - auto from_alloc = mm.translate_ptr(from_surface); - size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, - get_copy_range(size, to_slice, to_range.get(0)), to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, - get_copy_range(size, from_slice, from_range.get(0)), from_o); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_acc[get_offset(id, to_slice, to_range.get(0))] = - from_acc[get_offset(id, from_slice, from_range.get(0))]; - }); })); - } -#else - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); })); -#endif - break; + event_list.push_back(q.submit([&](sycl::handler &cgh){ + cgh.depends_on(dep_events); + cgh.parallel_for( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); })); + break; default: throw std::runtime_error("dpct_memcpy: invalid direction value"); } @@ -1754,11 +1624,7 @@ namespace dpct { if (ptr) { -#ifdef DPCT_USM_LEVEL_NONE - detail::mem_mgr::instance().mem_free(ptr); -#else sycl::free(ptr, q.get_context()); -#endif // DPCT_USM_LEVEL_NONE } } @@ -1766,11 +1632,7 @@ namespace dpct inline auto get_memory(const void *x) { T *new_x = 
reinterpret_cast(const_cast(x)); -#ifdef DPCT_USM_LEVEL_NONE - return dpct::get_buffer>(new_x); -#else return new_x; -#endif } template @@ -2222,72 +2084,8 @@ namespace dpct { if (!size) return sycl::event{}; -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - - switch (real_direction) - { - case host_to_host: - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); }); - case host_to_device: - { - auto alloc = mm.translate_ptr(to_ptr); - size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(from_ptr, acc); }); - } - case device_to_host: - { - auto alloc = mm.translate_ptr(from_ptr); - size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(acc, to_ptr); }); - } - case device_to_device: - { - auto to_alloc = mm.translate_ptr(to_ptr); - auto from_alloc = mm.translate_ptr(from_ptr); - size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, r, to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, r, from_o); - cgh.copy(from_acc, to_acc); }); - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } -#else return q.memcpy(to_ptr, from_ptr, size, dep_events); 
GGML_UNUSED(direction); -#endif // DPCT_USM_LEVEL_NONE } // Get actual copy range and make sure it will not exceed range. @@ -2427,34 +2225,6 @@ namespace dpct break; } case device_to_device: -#ifdef DPCT_USM_LEVEL_NONE - { - auto &mm = mem_mgr::instance(); - auto to_alloc = mm.translate_ptr(to_surface); - auto from_alloc = mm.translate_ptr(from_surface); - size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, - get_copy_range(size, to_slice, to_range.get(0)), to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, - get_copy_range(size, from_slice, from_range.get(0)), from_o); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_acc[get_offset(id, to_slice, to_range.get(0))] = - from_acc[get_offset(id, from_slice, from_range.get(0))]; - }); })); - } -#else event_list.push_back(q.submit([&](sycl::handler &cgh) { cgh.depends_on(dep_events); @@ -2464,7 +2234,6 @@ namespace dpct to_surface[get_offset(id, to_slice, to_range.get(0))] = from_surface[get_offset(id, from_slice, from_range.get(0))]; }); })); -#endif break; default: throw std::runtime_error("dpct_memcpy: invalid direction value"); @@ -2655,9 +2424,6 @@ namespace dpct void *c[], library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type) { -#ifdef DPCT_USM_LEVEL_NONE - throw std::runtime_error("this API is unsupported when USM level is none"); -#else if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { @@ -2792,7 +2558,6 @@ namespace dpct default: throw std::runtime_error("the combination of data type is unsupported"); } -#endif } /// Computes a batch of matrix-matrix product with general matrices. 
@@ -3131,24 +2896,9 @@ namespace dpct template typename std::enable_if::type &operator[](size_t index) { init(); - #ifdef DPCT_USM_LEVEL_NONE - return dpct::get_buffer::type>( - _device_ptr) - .template get_access()[index]; - #else return _device_ptr[index]; - #endif // DPCT_USM_LEVEL_NONE } - #ifdef DPCT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. - accessor_t get_access(sycl::handler &cgh) { - return get_buffer(_device_ptr) - .template reinterpret(_range) - .template get_access::mode, - detail::memory_traits::target>(cgh); - } - #else /// Get dpct::accessor with dimension info for the device memory object /// when usm is used and dimension is greater than 1. template @@ -3156,7 +2906,6 @@ namespace dpct get_access(sycl::handler &cgh) { return dpct_accessor_t((T *)_device_ptr, _range); } - #endif // DPCT_USM_LEVEL_NONE private: device_memory(value_t *memory_ptr, size_t size) @@ -3201,15 +2950,6 @@ namespace dpct /// Default constructor device_memory() : base(1) {} - - #ifdef DPCT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. 
- accessor_t get_access(sycl::handler &cgh) { - auto buf = get_buffer(base::get_ptr()) - .template reinterpret(sycl::range<1>(1)); - return accessor_t(buf, cgh); - } - #endif // DPCT_USM_LEVEL_NONE }; } // namespace detail @@ -13181,7 +12921,7 @@ int get_work_group_size(int user_device_id) { return prop.get_max_work_group_size(); } -void ggml_init_sycl() try { +static void ggml_init_sycl() try { static bool initialized = false; if (!initialized) { @@ -16677,6 +16417,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { }; ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) { + ggml_init_sycl(); if (device_index>=g_device_count or device_index<0) { printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", device_index, g_device_count-1); @@ -17046,6 +16787,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { + ggml_init_sycl(); // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -17379,6 +17121,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + const int min_batch_size = 32; + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; + GGML_UNUSED(backend); +} + + static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, /* .free = */ ggml_backend_sycl_free, @@ -17392,7 +17141,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, - /* .offload_op = */ NULL, + /* .offload_op = */ ggml_backend_sycl_offload_op, 
/* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, @@ -17406,7 +17155,7 @@ static ggml_guid_t ggml_backend_sycl_guid() { } GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { - ggml_init_sycl(); // TODO: remove from ggml.c + ggml_init_sycl(); check_allow_gpu_index(device); diff --git a/ggml-sycl.h b/ggml-sycl.h index 1c9d52115..a9f776fc1 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -16,16 +16,22 @@ extern "C" { #define GGML_SYCL_MAX_DEVICES 48 #define GGML_SYCL_NAME "SYCL" -GGML_API void ggml_init_sycl(void); -GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +// backend API GGML_API ggml_backend_t ggml_backend_sycl_init(int device); + +// devide buffer GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); + +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); + GGML_API void ggml_backend_sycl_print_sycl_devices(void); GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len); GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size); GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); @@ -34,6 +40,10 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index); GGML_API GGML_CALL void 
ggml_backend_sycl_set_single_device_mode(int main_gpu_id); GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); + +// SYCL doesn't support registering host memory, keep here for reference +// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index 54365b7ae..18f10a3dc 100644 --- a/ggml.c +++ b/ggml.c @@ -291,8 +291,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #include "ggml-opencl.h" #elif defined(GGML_USE_VULKAN) #include "ggml-vulkan.h" -#elif defined(GGML_USE_SYCL) -#include "ggml-sycl.h" #endif // floating point type used to accumulate sums @@ -2698,8 +2696,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #elif defined(GGML_USE_VULKAN) ggml_vk_init_cpu_assist(); -#elif defined(GGML_USE_SYCL) - ggml_init_sycl(); #endif ggml_setup_op_has_task_pass(); @@ -16115,12 +16111,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #endif // GGML_USE_VULKAN -#ifdef GGML_USE_SYCL - bool skip_cpu = ggml_sycl_compute_forward(params, tensor); - if (skip_cpu) { - return; - } -#endif // GGML_USE_SYCL switch (tensor->op) { case GGML_OP_DUP: { diff --git a/llama.cpp b/llama.cpp index 9614cdb17..61587cb7a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13632,30 +13632,28 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_SYCL) - if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + // with split_mode LLAMA_SPLIT_MODE_NONE or 
LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + if (backend == nullptr) { + int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { + ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); + int id_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU - for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { - ggml_backend_t backend = ggml_backend_sycl_init(i); - if (backend == nullptr) { - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } } } #elif defined(GGML_USE_KOMPUTE) From 586e7bc561be88e929a9afca7e67d8ead00c53bd Mon Sep 17 00:00:00 2001 From: Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Date: Sun, 24 Mar 2024 17:54:07 +0900 Subject: [PATCH 16/44] sampling : deduplicated code for probability distribution access (#6240) * sampling: remove 
duplicated code for probability distribution access * free original_logits * fix original_logits allocation * fixes based on review @cebtenzzre * change function name to `llama_sampling_prepare` --- common/sampling.cpp | 93 ++++++--------------------- common/sampling.h | 8 ++- examples/speculative/speculative.cpp | 3 +- retrieval | Bin 0 -> 1637080 bytes 4 files changed, 28 insertions(+), 76 deletions(-) create mode 100755 retrieval diff --git a/common/sampling.cpp b/common/sampling.cpp index 5a5450982..45d68b26c 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -168,77 +168,20 @@ static llama_token llama_sampling_sample_impl( bool is_resampling) { // Add a parameter to indicate if we are resampling const llama_sampling_params & params = ctx_sampling->params; - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; - const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; - const float penalty_repeat = params.penalty_repeat; - const float penalty_freq = params.penalty_freq; - const float penalty_present = params.penalty_present; const int mirostat = params.mirostat; const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - - auto & prev = ctx_sampling->prev; - auto & cur = ctx_sampling->cur; + std::vector original_logits; + auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); + if (!is_resampling) { + GGML_ASSERT(!original_logits.empty()); + } llama_token id = 0; - // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - // Declare original_logits at the beginning of the function scope - std::vector original_logits; - - if (!is_resampling) { - // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this. 
- original_logits = std::vector(logits, logits + llama_n_vocab(llama_get_model(ctx_main))); - } - - // apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - if (ctx_cfg) { - float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); - llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); - } - - cur.clear(); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - - // apply penalties - const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; - const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); - if (penalty_tokens_used_size) { - const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; - - llama_sample_repetition_penalties(ctx_main, &cur_p, - penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, - penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); - - if (!penalize_nl) { - for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { - cur_p.data[idx].logit = nl_logit; - break; - } - } - } - } - - // If we are in the resampling phase, apply grammar checks before sampling logic - if (is_resampling && ctx_sampling->grammar != NULL) { - llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); - } - if (temp < 0.0) { // greedy sampling, with probs llama_sample_softmax(ctx_main, &cur_p); @@ -302,11 +245,13 @@ static llama_token llama_sampling_sample_impl( return id; } -static llama_token_data_array llama_sample_probability_distribution_impl( +static llama_token_data_array llama_sampling_prepare_impl( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, 
struct llama_context * ctx_cfg, - const int idx) { + const int idx, + bool apply_grammar, + std::vector * original_logits) { const llama_sampling_params & params = ctx_sampling->params; const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); @@ -315,6 +260,7 @@ static llama_token_data_array llama_sample_probability_distribution_impl( const float penalty_repeat = params.penalty_repeat; const float penalty_freq = params.penalty_freq; const float penalty_present = params.penalty_present; + const bool penalize_nl = params.penalize_nl; auto & prev = ctx_sampling->prev; @@ -323,8 +269,10 @@ static llama_token_data_array llama_sample_probability_distribution_impl( // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - // Declare original_logits at the beginning of the function scope - std::vector original_logits; + if (apply_grammar && original_logits != NULL) { + // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. 
+ *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; + } // apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { @@ -364,12 +312,11 @@ static llama_token_data_array llama_sample_probability_distribution_impl( } } - // apply grammar checks - if (ctx_sampling->grammar != NULL) { + // apply grammar checks before sampling logic + if (apply_grammar && ctx_sampling->grammar != NULL) { llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } - llama_sample_softmax(ctx_main, &cur_p); return cur_p; } @@ -382,12 +329,14 @@ llama_token llama_sampling_sample( return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); } -llama_token_data_array llama_sampling_probability_distribution( +llama_token_data_array llama_sampling_prepare( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - const int idx) { - return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx); + const int idx, + bool apply_grammar, + std::vector * original_logits) { + return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits); } void llama_sampling_accept( diff --git a/common/sampling.h b/common/sampling.h index 79a998be8..56ed991b8 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -131,12 +131,14 @@ llama_token llama_sampling_sample( struct llama_context * ctx_cfg, int idx = 0); -// returns the probability that token of given id will be sampled -llama_token_data_array llama_sampling_probability_distribution( +// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters. 
+llama_token_data_array llama_sampling_prepare( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - int idx = 0); + int idx = 0, + bool apply_grammar = true, + std::vector * original_logits = nullptr); void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index e991b8846..8b31b678a 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -219,7 +219,8 @@ int main(int argc, char ** argv) { if (params.sparams.temp > 0) { // stochastic verification - llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL); + llama_sample_softmax(ctx_tgt, &dist_tgt); float p_tgt = 0, p_dft = 0; // GGML_ASSERT(dist_tgt.size() == dist_dft.size()); diff --git a/retrieval b/retrieval new file mode 100755 index 0000000000000000000000000000000000000000..dd31789f89c3a41fb84b88c08444eb8e4a9dbfbe GIT binary patch literal 1637080 zcmeFad3==B_4t3EnFOB62AHs8l3~%AAfm+$4P_<)4U1@zR&8xVKl4ZBAB3<-}^jE^2}tCptYahKi|Aw zH}|>E+~wSJ&pr2?bI-lc%ctJ?G)XB<@#i88l?PtjRqT}~Bb`T_pMP!es;OU}I{O;q zb;LiTaKxLg38^>amU`w)y==vT`F*|eT zya7fgNlU*v{^C?|$1@N4`O8;)XWsITSJCzPmJTqGbAiXqB57xRx>X-2Gyd}Pm&{u* zzq8Wl`fj-0tS_^VnP1Y*`cxPA3Ktiy$Y1j9Z!f=P1z<+kx3bu*Pq*5aw6nfJ)_Xh3 z&!4|!@$yatM%Pzt)wlm{b5tcAXBUnUQ{VCx%L*6YYW)^n-_?Cn4g9S*VP=rD^Un#Z zywA_Sb^iSPTUO4$<+c_1D+=aC*Z0gPW_=6inYBndx(;Xk)_BZcSg?Hg;(|rDMAvsg zewqPaKdX?Woqy4FTlFovWkpD;>oOjbt@^IBkS1y8U)TD+v-o?3i*L>^T>R}NWQ>My z>wFWw&kM6f{*>UHV1Dj3v#y(C{LXwbMl@jMw&$CbW@;)4GS~VO z7>6aDRMLHY&3(x%f7B=C?LQ&!XY2%VSGtRYulXJmKA+W3NjvLvRAoF;-?yEF7v10f z4_p1U$W78c*JsRIgH9F}E??25zTcSj+4HPF>8SbyZ5@9WofIxAxb+qp+UWX*MAawh 
z-o|w$8O(Y*no(-6)z4y!1CciV0%jyIO0b8?o)0z?=&rskW^tqbsD9eN7_n8~@>>=z zyXD&kf0A+S%(BITuhe8GO482ypto3fGsZz`jD~N>(`J9KwVIH$vp!zOs_*846$KW& zk?@70>XUR-eVv`*DL=oXs+jfdvFgjS`YY+4>zlV6v~~PN_g6h*_P6vHc>tzGmOq1|QHJT-^y@bbZ24%=Lffv{^HzPMKzM6Qu%Plb>|dGtg-A z^;2eTQjfE)i{KFEzzkFk=PEV9N;jN4P`w{8*XSXX+4J3j>fvM_ZWyHY-sVz?JlbQu zhc#G23-jP-BD6RgAfNNE_1ZzIKF05e11ggsvt|5umMuyEe^n^!HonY8>3CkPxL zo%_qXwiZu1w)510d9Ii4&-)VZG6`mt0O(3Fo{ipWpSrX7%}IDZ`X}`y{oY<#NJyG6 zmmt5H^IaASO^v^9)$$d$EE+$vaNe?lWvj+txs282drOuTEgv78KmV46w=64Iam)GN zw%?Bb+AYhL7cz;*2OYmtQ`h<{Ae{5G9_qhx$#)h*)YKNM{@n4MzgzX+K->ZUsIP*O z9^t?GmK6mHV^tj-Z~R1UvEZ)aZSVEYC|uD))%rh9PQ*5dHuA`+pet z{}}^&-TANSx*Cv?td=$?H774YJ*sQN%Ju!3Wy%w1li!OyYKtn)D3@oY&P`W=L+Qrz z-cjkgIwxICQfEmzKTa*(5vS%fyIooN#cJt1>hq}LSK$k3p2F4lXDO{bu~?PA?e^5n zjbC3oFZt=6aVxzkYNdD97-BE+mBcy3w-V1Mo}zuZc53|O+ABS(_Nrv9cDm*&-&~@O zX`i;wnXKt8nVOoDrF+VB&GUjM&Qq2-Ca^}}3rtl{>hbDWNqhSo8@93WU138cHemy< zMBp=FTUYB10T&@NZph*S&-&UNp5CoOZwpj0UX0oH;e{qymS5F=~wyR(*6JoONCxtZNC+`?bX9$zPM9sbh_Uw+)lN zNNC@n4xC=Cg-_?I@+P+`->Zsem)sb)RbN{?`{>v}xTx~(S6?{!+Ye^E^4AZF<~M&Z zeeI0O8K-AIu;BFcvP{2Td3tp)OdZCU#;R+v)Rk8{yII@5OzLZC`{}Di3!eO7$BKg= zlnpsz)pu3Q`fj>0E=>W4zLgP&~=21!|E!YEQ4Oua@%KC+@AK32KWzW@epFQy2Ltudm8!b8CUNxgPbt z+L>AIjW3=(lK!uyT`6Pd&xlt`8y_@r*1k{h#?x)%sHxy?d zbt|*JIo|T8>HF!`Ibq6}_{r4{oV?x@{_q-i`1I;H`lFQ(ObWEEp#81CA6fpPTZIi+ z=8P==HDz~eYS(gpTmCg~`F7XdN!1B~w(zjXwi?~3HPw=r-%fnmzMDGC_WO@2f5>Qm z3hm{FPhJlkxtaloJ~|lot<@_%zMQbW)>En0CR9p)^@pwZX@NGydwG|l0(CDJr~~?= z!EnOEx#47fOB@|&YyYNtEW|kU1OFF*%l;2#l)n`0SNRfiy&pJvFJ2w!KP1rh?!^9; z*oD%%<}0p~e3GZ%x_*`KdS0rirOajmG<-mxm{3{j^;B;6CK_-UH$~& zwOXa_DXz=(G*raOLqAWe*9+EZp1|+C4`=rCzFzPxHKf4co4Vj)O?9O`7cvV&tx`v6 z+qT)w8QYiXWA;`cZWuf@PiVi%U9N7NYveJ~1<8@=ra>KP{n|iXM!dGP3Er#E3Dixa zyr18F$$FV$&PY4vJ9or8d!E|mODL}>Z_mSvK6Y=_p(z>1Ea+xBW2*b1L2HfBp$X>O zu>xpIXKu(d^LUQl-+V^moTEd7^4{iSi17xG_{{mwq|T_e?6wyHN3q?mRgb)zYrnH# z_mNhAEH(J8jhl>{)Y2UK)O?I_>Sv78FxJdVs&#l$31jf*xqEDUoj&E>s=j3PZ**=r zL-VxQyhvy6c)+Fbz_H-62wY~3OQ@U%E+t=qi*<7Z^A((inD5h}Gu`uig{*fag44`E 
zTLN%SW3Een8EX%7B!T|qkzevfY*Mf?|jGF(`J;n_@<1jU&fYv z_14(>1xKWV*7*7bkMcWp3!eRgd(zP`wC{5tXc^mg(a}-*qED81!l!Oc2)E`Z!sC+Q zb;cz50U`_2}-0MwMSb zwRHA_M}N6&<)J%gxBlXn%RK$hOHb&3e)^A@I};CIP=5Q(m9sDZ_&3Wc6V;a0q<@m6 zw)}+nfn>F1`JoHSi(TiY>&P6{w_o&RqQ{@!NLqO^(>3?G>6*uz?)B*zO82I(07m7@ zN}tPbvhM%z)cS*&MVG7zkWcde0=VS&z)*hIeMX+S{H}uD^ljE}N`9kkvGG1L{m1g2 z^xOrPtl8Q}m2b-l94q;_eU9DNHT-Trui`lXUeQv@wZ3Ju3mv~LA6cy^xR9RRTC2qxK?`p$n)O`9&Ms$8o^6g<(lZx)I@unKI$tpUi@mF zjHA}S{4KX$GeZkZWXvw|$E)(`C0hApO*>Y=Sjkvi$XerH8@PWgZO@>dgrwrxK6vp7 zSD+0ocvqTNl{fYYv=#HLlc)V)d@E|%l18_yw$ZKCvLlf;&{aJ9giE#AdH1reR)1NQ z4}_0L=Wlcmtii6ZwV66awrWPUI+_!h*yIk@7JuoE?8d|#;tkn%eCf{YAGlPwG)GO` zcqL)Fn%I1BYHMkhYAg9tNp_=4?aD|Bd{EDv1ukf@c~A4D%mMoChkpIguRU)Y-Dx$a zSI>ghuZHGlLi;nA1JlEwESVNQIx25b%h;@k9r1xs-;zq_mYb^d|4E`D)OKw}1J17rZOO&_mwWs|Fb9 z?RMImYNY4d>8dm%ooT1NgUs^jcKN|(c{}YLYL>UtRm05kcG^3_EPu9L{%o_no%Uv! z;W_dg9z1%E6 z#4dk@S>8^2Cz<8#bk@}xr|=P<{O@gHoZt2DE`o%XuT@^-o^&Ma@Iy?xB`gY5Es z&GL5I>oLpQ>8b>?yq)$YndJxC<&(|wcG}y|EN`c)OnS|<)84bp@@aN?lU}#mY41R@ zyq&Hx={3_%dxx0iQ|7SD-r;6>J6&bcYo?v{rkmx@vdf$Fy4_BDy=HkkU1id1 zrk(bVHp>sN%bWDN-A;SYHOt%SDwAF_?X)+`EZ^TQZ_?{_JMA53mbcSYCcSzo-`ux& z_BwbtJoMEy#6I}O?ZkP+D~UtI-z6?3#_pI_O}vb_f%rD!W?~HUX}VTCdm*upcmZ)9 zF*d`r5b?K(ONp^Zr&SYUpGa#Uo=eBM=&dBh>&D~U^qrw~^Y=MXm#2Z)=Azf7#h70=El_7PuB zoJWi;GA%@W32`a$Ma0#_m=Mz%h%X>+CO)57M;AVi*hkFzmX=5CBMuQ`pG+$y9!*?L zJd(J9IFq=UIGtGUQ#@O2B0l0_#CgPGzX=f!A}%FPBd#Vsi@1TfKXEfLCeJjTam0R_ z<|9rZ&Lh@|L&SZEONrx%tBGC24a6SSdeLXmOJ4CKD@-o*T^3?34k0VdWqn@v)i`vI zV%Cjj+z^I zr4>)I^TvEXB`{Hb*QO{VR&l28#O92lLnR^uzd*e@?+b?p>cYTUg#5mvy?wXczQn$G zU^8{5S43wj0**v2v239hsJj>ad^0-KQ|RQ)$g-O$zlb3E^x~m`ww)KIR0_^EBmc{L zdAEOUa^+6XixrFNg5f3gx#6V)18oIk0&N%eQ3qb4oRqCy+plu}#r-SCU7}U))RM|F zR8os7-ds_n<+ttt=7qrf;>YbB@EW+HKMjIM`jf!#HS|ZP&1y$~bitvf8#wl<{MOHs zAJ>al;|Uu5Y2W8M@X3;RWS>6BKz)&gbmRvQ@=XHrzpMPDJHF;4cU;YfZgLZwRQY+lf7hK@GoJVY!eb@t z!c`mBhqKp>s7zXuR5^Z4zshUZ^slTfQROcUK+i>n-Mn^0<=fa1MyyM!%-0fI8X3<` zDW0-vZqExd?h3T7;dkWTV^!omlB(LSM{eBc9#rZ-e2tAIS~BHz?5d 
z&N;}8-%^h?pkEE3Kl;c4*ct+D?_(?K&p00&7HE@q?3AU2t_)1fOHmgM2e%^#GDkbh z6xT@^$$!DR{*`({LxtFPgifYQ0N&^QiIs2plPmKWSCLcE0p>_O&G#`+z^};0AG56Me1X1Yx~8@$DjKCO?A z3x@^;>hyRuAT&r}6PXUZPJ?dqpx>*Y<13Njry^s!4B379^0oIDX~3Bkow&kuw$p)r^;i&e&j*Tfd39h z{eSDR{@gHi8smDO1%ITjqrs1@@%c#_WAqx)1_#t7BsqrGMe z_d(WrC;p$L{nF`yiKXb5#b@oERD8!B*`-!>Jb}-QP&;nSWZGzUF?5!SJ!_z>Y9O^I|O(dfuRXq`VesZk};9C zhx9e)6UP0HwqHQ5wA*|-3jR4`)XMJKEOykfqn}krjym>q6x?&Bsy*G+fxlT=c=4M1 zwYa!h!i#pM1nz%3QSEvGp76Fi>4p6X1IylJAM8EWnI`s&n&R)+`BKp2NAJ3mYPPRB z|4I1tWBb4-dP2yYL718MJl>eu>AuGRY7vGctWv(1`M+jqU^POjOzYWy132&3Ki;UCU6ic@-T zBePFu8_G4gR=K*i@rPC8S9WOw{OkZ<#VK)}{QyU?`hn9=M?X5+uyAMhgEsa!+c5E- zSo01zF9F{r;Jel;W8+tF+XQZ9ojbicm-&^$JPX3(pD|?3k59l$;V(x=X^Wh5PY-0K zi+u7~cz~>GlUh@PH6JU@kmhE<7Lv4+y~nR4_B$hUe(0;3DBiXTk>rCJ#J7 zhX?rK0e*Nun#BVK+B{$p`>9bpU@|;FhX*90m;1=Sjrrf)F@FX!|I@n6|J{!H|2VW~ z*U`fKm$`4$!`y$UV-8vMM9qC?J=-1i>|pK>QkSJ)X3YPA%>O~m|FoK4&SnqIG5;TM zlu1Od)hQ!(#ci~Gh(5?%6#n%#Wkc`)_N@#aCNv=R`?!Us0N2g)oyu+!oLkMv24Ie$J6ACUf*(Z*)lP|2=Y zwX}hZ{KBuSHUxjdpWZWgl&3}bfbf}P@RUsWKqh>kKVwh@Zg;?I-gl?eG_Sf~CGu?mdGQb z_m0J%ME194u(w)>4rA;aA)hB9hj?gr9Bme%qwVm#P_YfYsf002WM3*7eRL;0YrU4# z(uAy^#J*HA^7syTnB)ucPUM|x=D+R^&-4n=A4R`dJlz zA3Ecp{#@`Ro?Y-#)3-p{_pJ#$kXRiIkEPxP;46-P=T*g3y6Jaldt#+4`MHYVcF%Ar zBX*Ju4{`=DTJ^0_sEZj0ZCI%K}g!UYj{Asx}L%)+-i`oePu{N(F^`EL&R-b82rzodZ%Ck;I2j!6Uc z(7<+uJZs?E`=mVvpOpr@77Y|Y15%$M*P3;%vFmiwLHu*x4%|8EApXzZ4m=8t+H}DC z4xIi3oW{Xx-X(s7@HF8yLNpzeejYkFJFW{INPjX5vpR5N_m6iSeH&Al)zKfGJNm$L z2i$KvVeEw$czfUlg5zjjAp305yufSmg1o*CUa&DPCLIK!6>IKx^z*aQz&RETXuxjO zW%!&q$G?L|JXesd9@hJ*hx_+auY14Mkw^H5%`5&~u%^rJ&_qE8&wy_^%7(%30O;al z;wK0%6Alxi=>mJjXXg(my3&R42ODmKH?Y_3NXP7-kuR#xQjfDAC!8?5YK@UR~}u* zaq94bzxg?;T;#qZ=!s&}o_Px0?T)zeW^8i>?3H_UUwI=ov5?qlhbVM7Vr)R6k;IwA z$bR~0)n#u!dLLbU48Mb&SlS(gUEU2niND|);%Zl*?GG-!teE!2R`k1T z=*y?=7n{BYk$WZLXA$dtbi2~t31rtxW(L}v{k83d{pqjRyu_xZU#sd~1D?a^Qz8$G zUxia9jwf&NOP+A7c4dF0yWh?f_wTfI#eK;c;NFHk<3-tbh#Yq}{Sy6Jbp2>MTQhqS zBKsF61zV>hKaNFSEc$su<=8~E`#5rs*xtr6ZZipsl3%Q-txc-DIzjEe_bl`P!mpBF 
zs(5dAO69LSYWMfi18yT6OMa!|!vE=CIe>S9cb{8bQi9H4a0l?yiK2l!$+!1aPx#>1@U=y6 zw{00>Td{4cvFiV>RloG@0Jf-e;A8y>qiFBPq+{7Z^B9v8j6pah*ebe)7oFc87awCX zjgWv$KfTIRIbBz~(VN?f2s4rQH-{4|clK4gH!~)pqurOhzv89Gk}FeqCwRY)G5J0| zn(yJG`QDh9D$wmOeFIyGm-Xa6@VFR!E+$R^pVD>-_>{e%An`UWxkV#3ZI^CijvNM0 zHZHHAuj2P+<5Kcz^i$@NGv9^eYXq0a4O|Y6z@=@c>aLBERvUs(X{Q-lcn3W81E+1r zks$~jw4bXTXwBEKf4R^VJN=#ozarmO;ZrAg%>$p3_L2^PM|rNt#>0Mz@m#>O%n2Xs zyYLcgFZ5B_3-$Xvm3~_%WSllWlpEfZ(&^V&rSlHjl`*pY8a3X@9C6ZbwOj4Fl>U^$ zmu=ennOj|Sn7u)J?i_B7J$Jq`X2037YptNGJgg~ zxA#f=#nPTVp8spLXP3SH({|t)6q83jN#4l$(dql$n$H}u{J&pjeCh~7|36~f`Vwox zr*dUK)y+Os9Jb1M#;gxx))&5}IjBdV}{pQK&;5(DeaXgQ4^fQjh==SGR=Sz&^xQWS?k1&qg;2FUr<^p&u zqx??v`XctK-w?eX`X~dh&(qF&c#TGS6ZjSXRN zjjw5AKUl{7UoLftjIlj8?C~uEhaU@Fbn?b6vJQWWU6nnt*}aXi?4>$ss)2obHZNy8 z+p@>@TC1&7|8n_T-*Ed+o~!DHlfN;yK6`HNUD?g&0{^Y5u9Whj>r`Df=N1mThsPZI zNz^5N5JFdT4|>AOuUBo&gAM<|EzZ3(FS1}Gb(~(E$Jk!Q7+=X)Pi4%fu=mIwRj#@B zD1AK$ZPY?1HxPbEC?+f*{9|U^)^quN1;2O9^D|iEWNoN-@KHZ=emX(kYcg+{p9=_* zE|EFQ++RR2ez)dpcWc_|kGjlDW8AhdZi18E*0gH+BWs$>N$2>{-mX|{N|V)|l#@B# z-J10$@2lj?rAl7>QirWsDN7aT2-Wv&;<{r$XdEKQf*zXV~+@{%sm^ojphFcVT1? zd!E?uJiHSg*WFzCQa9uD>?db7SDLMHI@7r_$ZAi@^)wDipK}~OJ!#H;d#?Q919PrC z#29>5ouq}jjD2|g_ypFTbdpB$cBgj;EZzNfraDOj^*i)IgP+~R7=?P#2XzbfSUSmP z#m6r#eE7})A3ymIFg^k?@bQO_d#4W@_E`9c zrK?8k@jm9U?SnFsXIq{bVc{lZ;pUPa^!WkgwdIx0dD3-lY~LWdvm053^OCFMIj5?X z+weroqGD&TVaqJ8&u(;=WcxTn)Qq2;=nUd-`VeZU`b3;##Yv2YyS59xy~v)Xw7A7HDt z`XFOkz2RHpLlq(S)f+xkraoZn0o#tbw-yOp_-Q$Pk$iFvOY|>fMFSRaDCr>Skm&g? z^nLUzn+Br&kSc(u!2wUSU1SIC{(un8iw>g)hOmeD(G5kn&O^7>`V!c1%i3+dz4^wqW_ zJOWM@)5mM*>nq7GSM>jFze@I|JNhX$gL-WKVlyZq7Mnqk_zkUJi$;7B`@#uqe4>*? z_eF3dH0;FFIToIDeA7GE#MpQm{yFh#n#FHM1EWTkHs?z7!T8ybtQeCO5dlTkHI(g znS4X+1Fs{;oDF`8r^R)dGd3^m&c^y1+86pSZ>@~n(+Zt>b4r5>I&CX!pYi5E! 
z2)Q>Q#<=_xoSf0PSn$Uhm(Plq=HrgJ zA<`aHe>%;Qu1<@{kiR$-U#{hyP)`tc!t{v(6MR7W`iXUT$*Y zWkBzEx%NMdm&;<{6h?$xQI~q{rq{rYD)iw(&mwZdZM$w{btQnpWQosWRw3tNgt!BKYASY zV6!_TeqzBNi#|TCjh%yZ;Trfk;c}t> zYVhq?I-RT?53!GHuN`8uu-A>Y_q)!?d&OSbi*A=dTVmtVv2h*7MiT9tf0%j$=%}(j z6f&-|KE$HY+um_+{W_sLxV?hTd>9LCtAwb!8D*5RK6o6w_eWzJ+B&iIUDa%<$67=8X28&_|1Bx7R39g9B1 z797hj#hzZs`*gD=c1F{#l%2x_~|0>Ac%?er{Ooj=E)c+yqbIyvyzd#0i$&aZ?JmNWxpc zQsL%bcG?@G?fbufuhZt(3~qa~IsTUX_L?O&M^E&5>NBu8iqAnc<8?n_-W2YTnxpE( zpSIoYI&kV%H#W*RY?blYEV0q9#zzQyWi$>$jH#S2DdF7PNqmPw;7#@ss<<1V0el&I z3OpajFQ)-Kgt2>zO_v)}=ZIZ@GIJEVcBC*$@;9u4JFyKuhQrq z(%}ze)AOz1QvB(J$F;qd;MX~G-OO4(%&_?mw)P5!M9j0VQ${C$<6k_SvmclxK6aaK z54CP~VPD3->1x7y@#BLw?gf@v1o4v;KfikikS0i4{QNc#Bux;Xs1p1q7t_!6;vdDc z_!}<9xA2A9}fw$h)ZCM|j z{*VdG^Jss_67X1vKjZ@DSI0e%&|M+=@dD___J@@7EV90femq~+8kf4rDNp^J{>fU? zW&IahZ7(tv-?d1Sy6ic1zg3s4@v=@%1HWPPb5qZ@*RIGtsJkd*ukYJC_Tf5h|F)jJ z8QguBFp~Bjhekh(Z(}q)iXUTWbbWXMdrdz43w3-B3*k}59YelgSnxTWbJ2d@%b94Q zTh1xM>mvOP?KM4GKVe=PzJVpio}20KJ_uWgz5eCVXQwZsy$1dj`&e&lpvbY}!{@B~ zYp2^Uw#VQ?t8S<7;cEPiqkRuUj7@L8hv|;7arz#{UUz!;J-mSNn?iUd$MiESXB_{@ z`Y~!tZg>pmH)Tw@n=|5__#@t9yo;+Doi-&bc{VuG^80$y-N{`(%a7G>`LW7a8+)|) zu@=dGBV#RNynykS@s_a`AI0v*Hq_VLTNK?S+NSmDJ0@>giEP6<)nnIg2;7%9skqp+ZK{a0$y0CcFrTpHVpYY?fFZ9uDw|(S%qx@87dCq>w znrHJgQ}>N6C(0h>@4@ZcEwSV~t!5N@brb8hlLwRoLj*4YhA~$Avd1hkGpC)poNejJ zme}3?)^};Ei|$fAzT885)!<9o>&eFVZOYrcr&G7?%6r5=b2MQLK?A?h{vJ;LV)%Q+ zpt-^|F6bW(pI#Y4&JkHd;b0*F>qQR1E*fzMboIw7X`0DCtmw$VYvrS=ye`^%EoP5OP4ur(Cg+i#O+baJLkBi zD|JJ!UH$$330_|zJn#kLwx9>xmc+nqQw-b&d%&%|=0xB(YR$3nTh$GIFFHf~mj9dh z-2-kPB3xnMxAR`QBgl!7chf1(vpc>~5@QW}6B~kU2R*`>UR#ckzTOI)QL@9MV)s0S zZOYJJxEs4PC%(0rb)yu!Ku>bS97msfk|Tadzhtd--VJr!y2mnNKEmUi_vjrFJBplH zU6)pwy4@X?GiR~*Y?ZO@oAQgTHyxea9bfIkS=4yWOvTB1)yWH_?S7H>>>WYx;Vkik zX)n6Np>+ef?=W`zPPxPRU6KZYAtoQ{3k-c~c8}s)9*#BRDEuf|COX2|IrIPnhJn() zY5$6tw`TiY=WMmnzMO%J!`=~(-J=iokG|MJIRCa9KP%2gMc^PqwdMEcdo+e!%3ZUb z=XpGT2(GsAJd$UzZ9T>FA?#A(50|e77H#MG4Od!Gu@R>h%_V+{bijyHi!zC~xl)T1 
z@kf>|>vC+bVk14=;XgLmILrIgTkQ|Fp_hM*PJpf`x`aWWv!TysOP6Ry_LB36&B$4D z9=-X36dMx*Glo7vuNly5@!m)c6@;t8a zygf(fb0_8Hx#4l-JD)twSyRG3e$V2!#(Q!WDZW+0{fjkolS;BPd2T8TE|Tw;s5q@Q zAhFzQ$X&xCdt*!6<#mgm6lq5*rEfM5&V0keSZIFcguija-t0^{AD7WphA1)YS3|L1 zO=H~hD03C#cqQXGmGf;=@W(c7Sl?%kts=}PTupfCsC(-n!bw5`>2$)E2&1fd5!+TD z`+U32BX0nH;T_*&Y_K7Aj7mw4hY-s+ zGrm3xpM$_>i9_$Sb^ePS`0V6Av30s0Y<8VIfcYoB;?pd3eTBx(|&_WXP@iZ8i*E z#5NnbuNY~YO^?D$PkV~bfCI6U)HBXsBpyyT$B_4t)3$sLJiKMq<;25tXMl&lQO-#p zJ>lU&2ObpP2-1$I@hVgucr?w#T6@ zxd%ed*(`)-IPEdp#2&+5-45i0Fzd~E*1dT_-t9zYsD&Tf_vXnSp6mnrxUOq=&|cfcF$m&x4_+nDDP%iS9CEO$i6x&JESEbiWz_VdKbX`G==Wp6=j zN|S}Z!f);UM6WzkR^HdcXA6Oqd;Bjh1TL{vacA$GC5(;k%}n3S?|a~zV*8YJ=T2kZ z4K(Dg0>)bQ1*=)xEj#KK(F;Dr7FTV_UDdKavo2R#a#yt_cQK~pobzs$RhPWq3l5tT z;me{^Ky$L!c$sk~!rp6)+|!@zpt;7^JmC*tN(fsr{}!ihrQYC6vac6;&gAH*9KID~ z^50+4?;jIxB-}w*NI0C$x1NX-3)sN`2`(`E3C!r^uZ%K%J zsKK}0F?_QW`6BJpXXcyr!M{4!HEBcS2Jy`jyHad9FLob7j`Kqs;LV+Oe)DX{>sC8r zuXoZ&5i}w)ODS~H(>*&jY(1SXdWtq8aS&UcleJZ3m-Y>vx^lZ)P1IHC?9zSJ%aq19 z#u!@{e8^24M;uS+BZ0EApThSxB6aC~?gQ^#5x?l234QqHRPWzoi{Mx1a(9TeZkHH( z5_iDZ@)2oU=Ba`=*fNjs2}5oopUhXW1vSA7qW9b6o{|Q5QkbxbP(+wX7)MAU$a-I$ z8<_YL=C+KR$jUOmL&p3z$F2~(M(w>B^ZOHQZZdBA5M|ttB=ft}n%||?{N^Ln9c$|K zRz2-4E+vEF;d+Xf3L9f<+9KMaQ?*h4;Htlt&V?U1XB%nJ-@PweDE6ILb&)*Vz zXTp_>-n(LI$2Y~=n&AtC{e<5T{&I!Lz@2m@eihLBXQHh|=$3NR2Z2yFW_R=!FJP zubv5xW`L{d;A|SYdLG}gy9!)h2~MX%ds7y*=bGQM+XubX5*{R!5Q+$w8}w(&N-=bW zPJD~aD!%nn(GkR#t(r5$Iy7K^AEp2t&+o9Wn0NBs3O*dNwZQ{E#*P=VctFTn<3kn? z2w6Pf#6iv9JXrm%d0(Ja53rUG1}8VEx{!?%_T*x%(_Yr;)2lmm2V;G<^rJ`U>yHSZ zUT)gzL^gDuCy$^l(KqQM{yw{H8BO}_Wi5^FS>nU5n_mxQOw9n>!L-ck5! 
z$l1^)-oDw&N(RaQs$xUC-B?r_`Zje2e1zQg>NL4(x%9$vZn0?SNDLIQsh^WVdb6N ze^$zR)xaGvA>vZvs^NjUCsG4-f58_`>_Zy~&)=SLonD_+wqSkW0lwRCKztRBy8ShL z`@77;I~~78@d1(cgz$<z^x7Xk&jT_)ALy9Lg*mPy{kQnP&fiY)#{+3QU_|ysGp|z>R zvFXj+2QX;Ou%D=7`p-)%;!g(;8tX(1+FQ$7&>ii)#u=sVXzyiybJE_|&w%y@8?-0$ zBo2Prc|P7rPt_aFZnf8t63ZSZcOqn=t}HzrHr5aUcmFsebmKI#i^x+ z9cdPH2LBLx%OgGk9UHbs=%Y~X0>@4)-)PC( z_t->u%svgVoS9>M@5+T=iyPkzxeHfdPP1Sxg$K*N z?#7gyu&d+uw3?)jxghxm)ja5c`C(}DE`rqcNJrmNYh1t}cBGpLQwjHjXPLKySo5?F zeK9q{=V-+d6HZ$n!xr$q7hjpa*pY2tnMnIzWciyZZ_Cx&@t-&CJCW;y;Un#kT}Sl~ z)QuhzsI%nEI$MSme>CoL>sIC!e>C}Kn$3SE^X&Wi#)>{RF9PqL$((A$7kG&V(wpx5Bf*+%aH%z?{Y7_rj7imt3Thuj&o%ckiJ9i;=jzBu7>a% zda&|&)45;zr{bSu-o^;7l zz}c0AFA3gqBkx{{#Fu&ZQnhvWlE}y+FY5Ip%l!4D$`+iT6P{i_+PD`cR6nNdt)F%M z_S-k+i+D zmuJ9dp34z`z-U~?g70GBJ9MecIqg8pSPk9QwaETfTxMow`a9@vdFW7nLINT4GUgI< zM_0a|HsebndrLwKBOSDG7O<3BxJr09VH|qV@8m9L(Fp$Q`_$ zcVWj__KqdsNaRpCFQoGw0R5%}|3u?#&k*ZuPo%wlKKX4KK>f)RPJ1~4IbaAjv`D+_ zsav0|78)K(yN9s5z6c)vo6sE|WQ~o+Llg44>;sAqNHiXF?geWG2hTb15Mr$pe-rUP zk^VH(cOU2V8UzR6qMA4wHzCovVw`*O(@*g$@d-{MZ19mj8X4ph-;7KCln}n?Fxf#$Z3c zEg`haJ!+)iz(?6jd>=aQ8^F+n-gF;r$a~SJC@Bbh)5FRJ|gm5?EHp15l zQwXtiXOSgEhpfVmDdS=5p!MKW=BuG!8n(;MbAW{zah*1 zSkS&`Ej4&&SN{_6#q}YpB;a@d4svuY`@MX;ycE&Pix>OP3+6Ani;e=llD?*@B0_0 zRK9KO?ebkUzNwgmOq+r&@CC}eXYB31Z0zl7{ML^)5@|#BcH5b61$Ns}z!?dwJq?Q(t{u%b-hoRwk+Pw3kcsUl8Bf0FNfKPA7D7G6|9JX!H=SK@qy zUPt^0{doi4*2J881o(wF$$Y!ddS4*V)_X6{=U>BEvZkgGzl8qx?kw(}Bz}o_7V#s* ze4AqTeZ(&@R~F0f#6|F{BF4A~{$~4c`+-sCn|Edn6`e(d}$^;KsZ2ng79;~?Sz{Nvj~?FMiP<; zCuLs6mbG57^<$B>Y@Q}KF?2@8ry2X0$P^8RUJ0K=4qJe(FgCe?Xpe_a@ixK_wUv#3pG#K#XN_#BtxgTM4s0T8qRk{kyl;e zSo&oy_&V`d{2H!q5*Gdb{s z&LsG7%eNLRy8$#HvPTHIng(7(<_-~yy~t@7nCFnc`_Ui8TGe2{9if{_>AbcN${EXxKrUh){OJBlPX_?--s+!0v{It zpoPfdx3Q*+@6Ai_^IFeK6&bGNmJ)dFWAMS+6yz%S@MKqt;WP9oJn#|PKCpHG{Cq%* z_za0WUJl>e0^d`#r@pKzWq)R}%hiI_V9p)z+6&j|l?pz4VZz>ueE4=1e72nWGuOJP zTeWye7gP6Fq+a&$6Re1>5GvLwU3ows5)f(McgZk+&QzyKU_e-og z+cjnI+_}_wjjTQLJ3RP8z7ui*LC)4|q;=%44DccRHxIs+f$Uep^Ju=QWcvdh0Z-WT 
zUi}ty;PeH8SHB{@K*z9oiCkL^yv@VVn}DsEJIcf#sF^#;s)3=IJIc_{UQv^qqBWZAd@tb_7oO z-p`HDZ!TdRVK6}>w7@(5NO+16CWw7jDbbi!c^fS~O-S->ldvANu(Xhw5kES;sbXaUTjFb2c zPP2HC%p2R*AwIaaA9w-t#JQi@gbX5bzWCi4JdCuA_s!jWpKKHT6yD*m(;M^YY-^lL z8Rwq-rlh|OjB|*w65lTNha-1bz!2)<+P-hxJTi_^>`&ai2Z0_%c|qf1Y__&j+C& znKO4WnQeZRviz$x<}n$G82<0N!m&AL$vyw%os_^Pe%@Kwv*mcUyLomT^I zHFO^R-5mCA3_pL{&nKF=`5?z`00N$d9c=g76>AZhyGKfV58r^c zFNryhrf;R-N_d{&N$id`j_#qYX#5Np`>{2af*TpjQfn+rqsG$ikL`nG-**o^8D|b; zKTd-`bZ2X_Wi64NWQ~{efp;;cRq&3V8Dl!wu#?)dz&C(b#=hF}`Lx%lcd>KXvVice zYT%Y{D~c@8V95dv)|d+KYG53McV%=Z4=}cu0lUa*=Jzs9IBY&P(HdJPACo#|Z1E>{ z@Uh5pV;y6=7k}bE;7=@c)1AC9#2VW_h)xxAZ8v3A!Jptx##r!XjyUL!)&YADQua0UK%Z8rPv`(U_pVOf5Ls84y6idL z$(uXv`|^zf@f#4I0kLzhWBfyeG0?*=nd{=0Wz4G{@!h#EY4c67Id5~|d_S^8pE&b7 zrW=et54Gl->8HCMIAxq1JmOK2z0b06Ztw~AG)jer*b_GR1pY_{pEy^wIq@|x*TlKv zY@zM}N7$lAM#&C3KDu&;SSQ|0cW;1^`f zg?8q`IAp)Rj7ujU92UU`hXT8drDI+j7frbA@j3oi=*3aDf{zNE(c>fS$Q-%C z7@wgQkBzLWk-F?TlF44sMaEu`?0-b=( zyIX_Ac5BPrRoJ=hx#E<$o3Lp+W$pq=J7n%A>>h@!AvO+cUrcw%!D8pIZFC|Nk0fp5 z>=*P?))5)EJn$mxQcq_RD#(APGH?_9ejOZiJvU;0AK9sIX!NOLU(lg%1nFBfwug-b z(J$(`GpCxq%O0VmgQUf#Bl`-PJY%~s_A1!B4oMl=Z;)rn&vO$tGNEf(qih`9LZ3nw zO?0j^k>^cbp&!2`AX`L!PiZT@zgqg2d|UC~pNUTm`w61+dL25R-Odc!nL7)a)G0sQ zkXZf?>}1ih%X75#&V(n}GdK6;oxZ?ABeE~=A;=n#fM3g4LMHw!p^r;u`#jalk~s6B z^z+i^4$Vye>IHFIll83h%wT5vY@R*7%=GO;{prbpG3m&Q2Q-xT@O5P!PbJK8w6 zRyEC=UfM@}#iijplQ}Xy@M30qflH0EWd?g+|D6fXE|j&R8F;4Ydn^1K6+U8~nyi&` zF?*?FQwHRmYGkcVh*RY=JPj3d!`NW3VQcN}bD+KZrEDJMJd}Hja${F6XkE|UHO+&! z4O>t7o2~MB_^Bp%_VQgNRhcjJ@M-%Tow01>ZUCv{^y;VZ#pnC&p978`g7bXZ+zy;? 
zF^@*B%xpcqy5899%>+jK`FrHE+p%G8Hu@Th=qu}q0c#$x3av_C@2Bj@mA+Pi*RC&m ze$by+6F47MsB?!-Q~&ZAnqF3fZ`gUOidwf}bt-=?$(|EU}OKCe*1s<0?Y3=R11KcmB7M@@3W!wkSrtpR) z@-3vC?cZn9<;m;!8F+B|LNK1Y#5Z9t`RCH6obmYZnx~C?cHId=Q`GJJP3pcASj0}> zlh5YI?7y61trFi4p?Q%rg;(KU3Y_kmNAL|7`BnT74*|FM%(d}uy<08a&z#@$&wyXy zqwyo)3#6%CBIAFIpQ`R2o~Ua>6BV|2^m5}YP%Cvd_EF&x@2@?WExtXz`QGk?+#nFx;6K@0DZH^VQ|eTkJ=izPPOG;ANl@ld5@fv`;Ex= zgXR42Z=yD4Ic9)|0W=>t)Sf&tBg?_N*5C zLismd&OPOqt2%uA>+<=={0w-22fCULO~^OTiV6!`jqjcn`dj^8RWAH82*20}{FANx z3FIG3{>{L>xo|=2BI2EeS*_E_zaIV~`8TnqKF|DKKn(xhDe*@eVqR$1A% zkh0a36W+r4!nECeRQXdby~PEb8`1wpuKG^vy!Q58bMMXx-~TCkKmGd%nIW3zoCV&_ zAzV(FMQC@&!8a0+a}(J+PvUG*GJGcm-LM~b6}!vvt9rC!ug^WK)o0*RExp&H8GGI0 z?=16rA7=tnE!dU=+xn+{Em_Qsjo{$oRm)q$vyjbzuTu_=l;N-cGv5tlJuju-YSn~R znO}0&wAfW#S4MvfUk2VGcPuS)YwUrmnl9}LE+{{e_5>GVzZXC0S%MGBo}@mbyr#`* zyA;`ZylOk)*0ThkPxVp9PPqHmj61F7JlaPckoh>|#@kwlv$uN%bq!@+>enHkuospw zPFvdWM>S^{bFzqarD$TG@{LOMU$82)((g$q%hWPn^HQ$l4{DcB|5Js2*I>?;=rzyZ zUTCdp!Bg8;s5x>@B+K(B&SDR)oD3~$jB%A)rAgU>)!mm3tcqRs_g2}aHQkrpdwcA% z9H=+iZ7S}e?7G-xKd{OM@9w^A=AE(2o*k#sCeYts-=lVY0{$B>4YY~R{~^w7%3jC- z^dLF^5lR4$`=jxA-Ko9Z)H~9u_jswZ-r>|c%&7O4Ug}kM$A?K;&l9@5rOqpiWc;H~lU*!3RNRN6@Tap(z=bNb`M$9Qgx9?zUp zYJKeT@NomK<8{$+rPhf5W>2^cToSv^M}SFid);rk*7^5d>fAmdcAY=A>iqStu65qm zOP%V%*md4%)tUOZS*N-_@H=y!m^Akr&N27c|MO$lyO?@?^gr}7=NvKT-ru=1KVu4eoKM8@#Gz}+DD zz@3L3Wuy2g;m=^-KO20W`ZM`<1NH4!(Rt?w=@P8}9|4s5wH|l@BLTIVre<_Fm+k$@R{rD8fdRYH^@}J8)=x>_gTY#Sy z@48!4RX4onMQG-7@Hgy7YL`JPz$fG50lv4ugWUg;M69~3;~P4se7KU z5SU6P0+R)&4O7!)!D4B zy~PPn5$6RL0Z&=t{)&aik}H$;;bWPwzv90A_?bM7Pno{I;^p$>%J1|1E1uu^L2~8Q zJj*#lopNh{`vJl;gf|J@;Vf`tY`PKNkKSkU$NS7Vte-o>ug3@O_Y)H42kNF1x3BLT zZeh-x#~QbAm2XcTacGr)PsoU~_8jUr%YS6>HGX?eeZYC?o_JLwGI}hzC>I$`=G*a& zk$iPNbF-)V=3)noRo{54zU9ar(pT5@=6r3k z;u#Zy;j6F632)-;>(L!8e>CaYSYNI08+X#Qxt`m! 
z{{(ph|MG-g)Gd7xokR4E6YR0}H0GIi#qP@(Yu+8d*(pnjd?x3Yhu56N_p*Dcw>N$= zVV<*IQ;(7FuI=cJpZ3B}wu}9WG45yZ7p+C1LrdD$WrRLupF(I)WS}eHGw-r4bnqhj z*TipvyDj9CKAOB}M9mTMg}|@KY2ERl8yN?&dxkcccR%3=utm-qzx|iav&JHC`7RL| z$^6co*bklKo<|)u(9-)8kmYIb*I!Y)pEY2N-Jfqq^(ROf@AYbz(1MeO1rJW0eeUqs zWB;cfa3}i|PTUE62d`7RhDuu!?o_*lFZ#e=GjlGJ`y3iK+>`Aa4BoDaoP!#9oc&Iq z--7RI&D?{_z%RTz{@geuHthR)fPF+1?0LZ3e6_i*q`o)3(1Q$ql(E%#uaKiXUQ>=f zI?9xzoomiQ1LLc@$Pmy)( zrH%yXBVUV64~MABIYvScRadHAsl+|eLnHpOvH03I44igds~c#?IX|TT%X7_nEwpFz zG~xd;pI)`r_U^`XZ#R7PX``=Q+pmsl|3=zeHQ8)m`r6t45QA?C48tvcX!1yNp6>)M znX|q`boe);b@&@k1=wedg%|P5kUgQsIVO)Wbw<<96M)t-4H+i2<~3`q<|2m+48ns} zSz{#l+i(4zd7a4ECQOpY4_q>i0qS|y$TPHMue4?Gcj3im-VE|dp7-G+PdIU9^Sn^l(4ZbTD((u)juiQmpm&v`ZlbF zJtu<>WnWC{kpAU3#|0kt);&{LgM;CBe-j9ovYzYsmM5-Fsr;C8aCwVTD^oe&CUda? z`*BOEYHP99>Ne_=J&2J47wvzawb@*&&2{K6++QU46Z&@2wUfv7EPraoex40?ngw@` z;L?J-1zoL_`m4_qe}VorM}L-U++}eTolDN?9UW}O(-tLG`sv5bE>D^8J&`qRTY|tp zw*&T6#%)B6+_xpLm;Af1TQu+{_GYQ?4ucMc*5KolCVOjhkqL5WL)IA41@5)l?QRbC zjIY#|%+cmJ54Y=o*ApH|{hf0ywPuCYMxM;4qtWx}>&CuStodZ>1j9_7z@S}IhIz#5 zn~aI9#q1FneJe(Wm$7T=mm3zG3eEl#T~=uH1#2unZ$I~1^lZxl_eQ}WyzY3K+WiJR zz{_*h@a}CH#og%fuh5SPls!1qTv_ zPWE0Do6kLD!TCU%xmMb6I@d#64jpR4X?zRRd534@8rIK(@f$tpV7<|z{BDdtbHKKP zk7As}XX$cl4u6mFk$EU%^l$M9Yn~-Yv)h*;#1f*$ncM_x5$7+Y4=tb1{S)Lm_T z56O^K*{72`i%fsh$Z`BH>ge!=Vm@~AnxQqzSV#W-J`3Nn)@iHH!Uv?^vWK_-?5cPPj#wJWGb&Vy+ zxvC)cTd8X-LDKc-lBTY)1mkzoQkS0~Hn;k7NTcI+S6}^@$ohhkcGOo7pLNz3lr($b z-PIQy60(CEeE-#O~s(uO9uvSznNJ1bm#a)|Yo*Up=-I zXMI6QJK^KJwX?pUq`!x)_^ft={yfAH^p_S{Up?piob?4sJL^;Z+vofMo?!CF;WeX> z5pO!(K3HrcVzXb({&j)nPwdpe!Us&AV5~EiO~0~F5A&b=<7h|DP#XNE!$&!CO?n_k zy@uX|%-6|VBFo%O8L>Z#Zr->-d{&0n4E==f&my-rT6(@yum2{zS>$DryW2i($5oNL zB^>spQH$E=+(LaK7YPp$dBb>ye`dby3ExP$7Rm{ZZilDz#5?nWO>_n)p781IR;Ry? 
z{5GGIw&x-@O`yG(X|L4s185yDHgq*dd?Ms7VeyG5?NeMQeh~}kZz+5DI(`wJujEYh zbF3ou>OkAzzT$5(#Nd%fv(#3-Z*g6QHn?aUJ~$ruVW+=IYD+PGKg0g+-dgScdh2sh z_it5+oy73>zXpGaS@=y{jsL_<{3vGN!#W)w*0}QX&WTIQp!}9b{M4Gb|94E>*5E(f zTgMV|c|U{qeR=;?o(l>GLD@G5&k+&{ODKOMA(t?Yu#$E*6Sfm@q}bX( z=+5uhIJ&6${|_rKKkJr(~l-@`fK1qlrm)A#H6V_>B4zd(I+nu6o>MT#Tmg1LK%b1{N#~bewviqiZwDbvi#{08*?=kag zBX~}<@+6aIxRJ*rd4UnJMbB3(gOnGt0bpPMwpk$CnP9 zjE~t-_qMOImKAyF*GJ5&p3aY!o)hT6ODlL8yJ}_Ydc!yKIz!%1#6L{%@+0t);WqK| z1Y@usd=zsgxYHIEdFJS@R}5d5HV^fM@o5u38PD;x2UGD|lKG&BL$mQ?!Dm}|kUmqj zg?>mHpWD#mq#q@1ohJ!d=Sf1=d6JNIo@D-P(T{8&wX~Ys@%w9>jg9YXk?&W?`>*l- z>yhsl%lohM{#%jnL-PJxyq_2O9$%gZ8|U%<+mY{YlK0={{esB%+_V2+;{x6NpF^=|O6`wWnV;hL?)sud;Rd^hCS{3a_o)`ZrC(o+lU9`_v4t46xCr|78 zgNgXUPUjr;bbMhK;tRVOU)ZZxEyPFC(^7QdW_)2cvmcw-;S0;Y`kXv`VTBj_w7}yY zd^#s9wbVagTZMmsDl^(%%Qx-OqTGo z;|D7Ke${=rFGg{13+*k0E(D+N+%EVWWZ-i;_!N0pe2eqxv*7b1;E;CyL^}gDPgz}# zR(`pANZHp2O|;wK(#oXmG;l2Q>>H!wwm$W#^8ebU>gr}|;opAC75?va?(kFV;=%%t z?LRK}t+nO{S%aQ6{D#}{uRp!o6+XQ|m3!Eu$470K&mRa+2OrYDz@L|`$`i$&q$QRm zGk!_*<^N;u%;Te~&c=W4On{jzkdTCIm}C-`Ob`(TAqr(CfU+nm#FbVOVrxPK#ibPy zk`P)G0?KII65j%%l^IBtSE!~nC1AB+tW{LhYF`7`b|wQN0+LZMzwdKrCK*B!waf3* zKknz=d+xbsdCs$)=RD^*65qhd*jTZ}TepvLxL;6Cj}hB6@9LiO9Q$aGz}fG2Eqj{+ z;~pk-5?(!na>7R{Cu>|jZJE&;Uan&1McKY6pv>@miw%FKKKo`@MHZyIgvQlP)^}b~fho5VwALeR_4I+HgW2~wTAF4UiP8*zQ zG0JBi&pKI-YDlw(wWaMewxwM`91w6Q)aTm2x1g*+-{(LX2Uvr<1)3LH^ZTU6xr$D} z1Mr6q^||g!pC)`FrlIqReq)A4b)QaTRva*jXU%ONaSrx9mF7qQ=2m4=o>+R0S?USATnGczU5-)~$O~}T#XUIjYm<5~>v)gK_)9CN}nPD}>=s-2- zJL`5?y{*7i`e0geR~tNZn;HL>Y}N>^H^kS?`;ppObgG@P^w0IAJgIy7(1^MNz_lx{ z5S=0Il(L%F=Q=I;qsqAaE;B6d+x*L`7^g|ZLmsoWJUML{;~--YfW>C!CjPy%%t3(( zdm;CZBmcv`b+eD9t-oU~bh^}QatsDme*soEZcKBSfR*4_8vfCV6_vF)$iq$Wy-B<; zVD7OAzBg#e5`6zf4#nv<&lYG>{$bx$$;n-1$$OA<*v+ntM{a<0?u(Iq9t35??9I}yN_=-Bq~F74 zIKxMRpRDhU%pr~@{|Domi@(e3xTn#iA^u~CF}Rnj;hMuC)dMV%eX>$tLZ9mb+op46aMwWXLUKB6z*T!FD~k!LE*EZy!Arb)iurZL&5vl z67$xFlhgXB(RUiElDp46~&vx%I*F+1R!g#Kt^)R(?|+c-9S! 
z({^Tf=QLHGo&PHGJN;uKb^{rYw4E1v@7#5EUS-o9Yk5wy|3fE&ce*W& zv=PV*p-aJ)QuLY)z#*JI&J4s?~B z%$;Xa$K}+q8rqh9G8WPl*f)78& zZ7-c9AH|(S2bAMW_8F9| zxT#I-F85QGB1Twc-`tnd@CVv%s7&6+o&=c>7ySVK)knn?{eW^S=87%ijM!OzTSUBT zhfzb{J2P%&P#0VP9_DfVfXl)4q9v^6UhWbno3ket_I8#p@vVI_k97C9wV}RE>lq2r+x=Q_dt8z%5 zc;2NPJul;#9?Sexo3ziCq{asSF;kvN^SlJzFWw1GyCc-;_7wJq8X5ao<;#M9xf7JH zv>)En;Is)FPW*>U+cqKlD~;?Oy4s#I4S6Hw8EChEA6J~~J@$mOP)^Ha<&ZsZiNjTm zkKi@FF0odpN0)9{yZUlWy_-VD;}J{o~;-Y_9NN%wE$XP%6T6c#Qgo+K-|&O zpH!x4DJ96PJP&1A>F33y%`?X9X`%}R|BvARNBLLwftmXG_kq0*9qa99A6VD+I%%(@ ziEbtBPLs1IfJa9^Ew-FZoR7kpFCBe4I84<@oP=N>KjFP2KX8%!$3nip&v&V#HyFrR z6@As-F;?oUeizAJdin2+F$NwfA>*CKbFlB>_)P@I=@{czgnkt|HjDKAKCvB6nhGV=QCxk!l+! z?K9o5w9Rxivgb!?vBrNdKr{NkC!b_aKKGl(K(P^gGz)p*chG{ zIvz`VXk*#m`B&gcJ@M4DgEOwBydbUY{(JlO@yPa8$`>AP0@hZ_7dn}q1m40=dOFwH zlR0@|Fz&nX+cWn<`|S+wOvOA`_JtZ;hSRJ`QIE`l?9@?mDqvrN2g_MG(4%j^o3%y-@={Umce;aB@7E1%Tk zg-(lUuf(5~IiSc|@$FuOY?c_uTadd88|=>8c20IKMz<(I&W=Vl$QLVEqCbsS(fXIqYNAVWHP6^Go5+9=qG&!I^1)Wll9KG zEFZ*x8*#2;zDax!{rhyjmni2TzVAyNsDGDzclqQI8a9i(Djw+YWGd_xa;D25*A4QG z@zlSSpZRTeu--kCdtIR3$NlBbi*vX)_hHUJ+ZX!N zvVTO{b7B6pD+2i@NASI1@drl(d=e*Ct=i_^w`-Qg>*U5_mY~Dkx+c!y+1%F;I~jxL{9|BV7#JTJ zgJ=C?V7?-dCS&lde+7z zf$ul*~sFyRpKAU849v&6w-xCQA_l2D_vX*wG#QAxK;D&$fzkk^$Ldsl9 znM>i`e~w{;l)ean3HY`;^S)gMkpTchhGHoQ2O~--h=Qfrc8ZYUP^GB zm@Kj)-gOgcEm{7!N|o>!H$29@5xtFmxT{q|@l!UdCv%{+rNZDSCry05Bvw$7{FixV zTA`%PIWtW!fAX1WH%PuhgF}OV1T_vd%fQe_aOlQ}rM|GJ!TxX7~RYyZP^%RQ>}_`hUQxyZTO$r)xVuCr#nYE3h*VUrWZ^ zD`|7IGt&M}n!qj_Stf9>{%DZjmia}%mI;hxY^GsTc?et#()~>AT{6aB0yEYS2MT^C zF%DMB%IaYpnx&mx#^E*6vI66f)x$V6%70zP;U)PmFb-KgjKg!?{O6MY0^^X?!#Hf{ z=D#Q9zrZ+T^)L<(ck|x^q@{Nmhbl?yG7c+A6WNtMs%c`&sIrOa7jnOl`#A37xL?Bk z67D0ok0_fM4qcsuzPr-!9a0WFHf@ADYC|8|eeXSOyT57QCVA0~blczvbQrNkNo?j@ zmn>*Y!|t#iU5{~Qy&&9MfDX6-{yq;{ZYDi~ytgvGYl!6{afi|W^z~%1iAY|t=VoaU z5yEev|Ir0sOMDv@*UG!ZmC|Vzo-Z`p{8>BYVe1sVZV9#m^q@9mi!OI8i4p_HFXxLA z9oh6nkZmhi07d5?W`on7nCXM9!(~Hyc-SJ=0c<2kS={~=kAXe6USDIvc0k{bCR^(xqTg* 
zO2<2Q7?Z}YHyYkP_$u>i>?Te4)cwYo*tU^04c|E7$3dURk9YgyK&#b$9_qKJ#JT=T zd1-2pSHor~b?)T-*aOTDvENo9Lp;b3Szj;bdywAkVpF3%KfTs&6w5T+;*WKFV+U}g7JFZG&U)yn?7%0yq=-09!r#AH;=t%treV12Pe^QpSM!C=+cbc^Ol>F9ecj5-s8!RCh`=s zep~(7eeV=UnX9d=*^2+Qtk=mr&Q#9&3gam9c~f94Z)Yssmx)bA`P5jo?VkB!D~Z$h zZ~qTx4C-srA1LdI$;vl~H4X9ENJ6%X57Ha7d9~OzMZ-9}MwW#+qLvS? zJ3Ka)GhCAET+~0FOU`RKG+c9b^lzT%K_-{Kk138TfW=8<>?eiy64>uPT~Jf1UI_r0=Lhk3Q~rQB|G#ukO?XZfimISW6tt!9sV=f4)vaUhwHtoK{NERaebj}lnqf()6WhCq`JbdM046i=@i4t?clM9- zulXGzUo@9q7GrK=%%!Z#6YY6t!jyR&`Km{z)NO{Bc`~QTT9Geps><o%6b&OTmV8?;MetkOU72?MrJJYlzZ?JAP1o>=Cs@`yu z&HB`9Hs^tv%D3^icIQFnqEG)m*ZC=W&ojFwJO6eaF%DnX^|sOJ#I~;z9S+*}yik?w z+KQioW}aEKU_slk4~yD1z3I=F&$?Jo`O?W}qF(dcx~v(!9vw|${UBFLT3=X|yqPks zTKLRW3rgB9`>?pp^q#-W7HrNvm3dLH%n$u#rjxg)GUrN}e+{&g`9)7)x0Qu^=lm#fI%Q@PVA7yjZi9rl-D_**GE8+;;; z>E7Tt>_4W*q1zh@?+wS>^VnywHwf1P@OhBDOPd1lvk>1&e6oXZ+RyvoV4eUxHV_vo zRnh|eE?^GR-lcDLA88@&w=j1V86kDWS_}^Ny~OZ|Qj1%>{hVK}>+3vqq>uBP5z6Nw zzUG(481^|vyNc-ZN#g+5DPw;Zaii9jToK`q@x$6xFvd@i&+O-1nyy7Y213XdAvX?VE2f+^$ZVlsH!poe=>ETZIIQo+w;dDPc)oD(TukE*ZA=NP5r2$;zP2`0U$dikio89&bPjscZ1ktA(W9%QI0|INzr56 zw3D>w)zgC=?R=NC_juoaz;8eDf{P+s6G=5!)b71s$GE?U zyf$^4K9?<3!m$D$$m^eQ*Oq>79 za-K7IZry>ch`ry3UQ|xjdwlz=!xpwYXj~|;`H*&r{1qEXJNohr^l#C(%u9aOmQ@aq z$37)?q?xSQzfPTzudb_|ft+}K!HBkV@xN`w=l0gG6UJp6#N#Adv1h52Y1c(8lHU2T=vL7dzb zu-R|G=M(s_Zs=?2r#O#0*7YdmAEWJ?Ds;c#GjOSZGW7BNV3Y%VaCCPz)2Z8D+w@pU zSlqs3&JL&OgL0-*6t-rgLwwsL_E~p$rRzq!)3CQ9L#GyO5@ zbzJ&5%~9TA2Z?tLw2<_^qz85EWAMgi{6j>iAim)9 zCkI&^Z5uU5GP=V@ye}<`5kqKvD)X396qxtWfS>96?l92;>#m88kqySiOH6&F~8SwCDxnI_yz6Uc69Wj+4vb7qsNK=>Tcpj zUyJ|o?ttBIGje?o<;nlz2cJ)QBj{_ZF@iV?5&9hQT;`TDnNx~yv6MHD_3#78ztH!ktmjtGu3S@phv;ydA_IB*ubG{-=GS*% z|86)|C_1ae{Y-Feps&R%rsT-n#k4HB?sfDni8C{;xTvj|^~hq@IXt>OoHb>EPhy6n zhJOUdg_dlzTWnNkz;Hr_qdFvy27Sm`A(P0zf&6k7?*i667c5=S<{6e|^j(w&v}D{FO)Bx9hlbbEabhungkPGQO`C zIuKYKR7-Te`(C=k1nk}8l<#G}F}C{LekS-Mw489>n%VC!tiMC&WgCY$TA?SM7W)tN z^RrPy9VvoSDMS4@bxWornzp7BqeRweXHjRc4B4X@+MXNuCUoBleoaT8{lsAQ;xCjp 
z4L+Cz-#Ez_wZEX{xWU~sV6!&UF_wR=6;pDuuoKBTjo1W3$IQdpk`;SN*D(`Z=sHFj zl$C{Vmz?Y6=ShMSiLSwcbnzXrju3w@{Ew&yU%8`O&{=dCw+#ughsL{x^z;v7UV<(p zazS{Q@MiWgc>jdXBz-U!;P-}(lL21%W2Gjz?qdGa(lNfVuC}DG8j1TLu}E@5@-5s$ z?D3`yV#x`=leVY04wF~v$!Go`<8X*~>0{Tvn32hoXvZ3CEfV|A9)d4N1OG^@68X>X z?-o(_Hh40vkFDVte(=Hrs?np5XDWQ}n8SbzHzMo6iO-iV)p6qUCm$XvGVt@9jYA#q zqt8$NeyD@Br_c8-7~+t%SkIrQIGMBPvP$-9?9=5x{MA4mx#NdAG8tF-uH!0whlUqd1GAaT7Z1U! z4if`;6?8NU9{7otsPnDWto5Pu92K4=ZNHW8Yxur{KDK~AC48sf`uEoZw09g^qLe+8 z{Z}5LHU0xuYxi9|)G;3U@D+WU%NR}I0+w%`+&5H*p~Ud?Ku1~7QK`s#_FHA?baZDM zc2u7mzcS+Gbo5JXIG@S*@{KwCS}Sn$c!ASDg6lJWzBpI%T<8CveN|`t|9^pRz5PE^ z^33)BPdt({{(m#@t+)R#kv!}RG$AvJ=(9eCz(8c5^r12Ey|*&5B~PKhjLL4x*b?~O zTNz1`=N5k%o4P5ZF7Um#GE9=E#9zklZpv5{_}*I?Cle{-Hh&rJZpv69W$?YXGX5@Q z-03gF1Wt8@X_1t{_uk5QPs;ci{tSoObWDM)|#S*IgZcKoUC0q_5Arm9Tw^kea;CV={h$!z4QunvdhuUW}>6bKv$d2 zykHu(T%&Hwb-ZV+DWq)i$x+z0%lfIY_@^|O@n44@a{jLRJbgNQ(9P;})%M5tdOrQt z7gdo>Uznr#mX5EYT`ke`TumOk;gtS-4bM9br=~L|=CSy{NEr<~C#v#_3HtL5d>^V# z6N}y@s zrXAt5!#p%l*A>#1KBuan3-gZxY1^qkscd4^IS=l2#&7w;s^~+YUhyUQ@;U=>!pCE2 z7;)n@#|id?1@~rb0h4-cB4d>LiG;M>pnwsOLxGS!ed1>Z0FmSr>Ki z{4y8cLpy~Zi%rWsK#kqhH@;eYev4BBdJ=1tCNuU}E!lB9`NfwfpFL5F@TqQEWApu) z_5aVs*0|N~eEpfp&OhNV(md4W+Xj4V#P8%LwKWS_9*qujX#D2Y8J7dtGBlN zr`zzDw1K^2zeWa~(GL7`_tg7g@&+)Gb`-G|6YT2&Y~bzz__uYT@ePw?y{6M|GU&Th zbW>S___Ukjl_Sd%?>}F>{9;FNtRjH3K33&@9ij&Y>)A^^rgD5nN|m#HGBM@A!~fX< zzOXJ5#MAw)?XoALTU@ExMcxIy<6pJpS=4);6_V$U-t#PxJU{I{&+U@u-cDSfNxyr; z^_$5rzU4aZF87yRd2N|K&+m>lu9xyxe0#oGl27*9L1$9l3FbG!@^V9He2U~-_3imC zmV6IDTMND&pNj%CwH4eM4V@jC+C6Q_`lsN*nfbmSjeRl*{9g3m)u!J{o9_7UYSSaq zrl0ks;3oWXL9Dhzl9=Yh=jgAiwMZ&z}#-kb2}q z)DNQSdct$bCwc0{ZxLA{K8#&u$!=YiL{$$JSgrW)f|ZgsvETO$xza_}>T;zAICYh^ zJ;CYw$lQN-nT*?xa~SlrKOe|NsgoIUn-sLd6YI|~ z|GEjgFP8rYw>7cGvi|;y>zn%8xnENM6y*hNio}ZBI}Cs9;JcjDgRNip+qjb$vyy(# z?}%I@4*HWx?jB<{cNY9tYLS5#uA&0?7@|O{G4)R&xwD3 zXM*bOh^dl0d!I2VVy-tI_?(_{O|nZuv*P1dtyJ+f;*MJKsrFK5YJ@|%T+FVr{> z%&_p=_P&^BRh}_^?XkAE+g~xi{rAU`oCh9GcFG(_d}}0zmV0AAM=0E#;DZ$(chUFv 
zKd5|7vW{w2V_V-B`=sA)yYosdZ+evfxj>$=v5^0$CeQF?dEPKf%lrDq$yRp;asP93 zPqo}xHu2c6rdmA_T7%bh=|~UzgFH*MhQk+68|k@V`pA}S+iCI1YPpg8U#mtZwsC30 z2yEQ=a?OqgHZI1&TVZyHjr-OQ$G72|74rao#Geso@WIi!&Nn&p{VjWB-Hd=ui8*Z^ z{-?e*tmP~umP(p2eX*>ooO;_>BW*hzY};Jgb{_5d`%J56xYpqL3D0x6XJ0YW`-52{ zJO73 zTf>0m15%#1*cy&3i%zMx(x#zNDMzp$2y6sKpI5LRd@1nPqP=D8%kkWSf1+lqZyndi zVLSTZgw_j`gIh-pA|>C7e$e$#`7LH!GKk%RM~K zDfr#;k*qJLxbA~K51h2IR^Yes{*5@@)B0hf3GNH|TKU?q?3eQyHqPE;DSGl-qECxV zJyETE`Wy<~+D-eqo4bmA_Xsx0uJUtr(umaF=B?pztlb(Dv)d}PhO~Za@v%8KI!A!h zvUe$hc%CN%v~V2S6h0|-gU!s@ORVXs_=d;0feXIjCCA=j-TDHR7t7r9z?^=LgI5BR zKMPEfUHR-UbYFQ{&h89tY}qXG0mD0Y5`!}Uw_Qi^Kl0<3ZnGEv!ySgf-hAxE$L_tm zP1bM3E?%7NUmJ*H?#({Mqkn?V#4qB*@WkqT$`$ypq-{kfm%n2vDSBCjMW&^fWW6kV z$<`TCy!*+YXiTYggU`pZ)wWyrM*E|6m@|G=W&&kg8@&Al@D~3%!{^s;%%yJ$=_YcyU(AZ(oyP==M zr2!h_e(EveZGFV~#o+tBe5>a2?!Xqv-E&a+rpq1vBQ#tL4SS&BuZ?lpIvtiM-|-z< z!#cigMs@=Cy(ZvzBESdrH3)E^347Gb_ukc36^IYNDi9z3BK+8C-_d4pMAiZl7%$I3 zn{WJ{tZIME(^%JG=EZ91g)hq*z$XU(8mZe5TkQd#+%Kv|p#@WbN3=eO&35Qj&IsC^ zCH=@ubWR(3b;rB=xmaiWoPIg(9d_ML_U~@9T6*Wp!FEpvN4stFV>X|LjqQP?Ed6|> z{qvNsZ){i%zD2W5(Ef2?cMQ50U0v?7#;?;qa4{17Osda~TCrudofs6fMdI$r zy1Z30HHR}UZpzl<#u&q^ZBuP8E1M-%O;s<;T2?u0T{X829w_)e0(_Tt3BFpXS8UxH zmx0U3H6*^YzT%A`vz~fm=&T{}yS9|`yq;&1UDih9yhh3@c-Jz^_HO#DbT0L7`Xcqg z`ODWC2A6oD)t&G%-F_aDzG|Gxt6pwA3mu;`w8a>E%+au(CLN!ja=jIf#7Cmo|YM5s?8d(>defW3e zHIGbfxln6#Q;&&q-RKW$Cu_QetQmll;p5LsMK5bKkGH0ZZW2KpGjB6`89u?D>y&TL z3fBLw#BUthVBND3x^HM5p*5n@H6EF3Z#+E5-WUVDjG?}{{aBx;ACqWPI(-^HFsx=j zbkW2&d2ank&Wcenl2_V`zeS1rMO(vq4|oM1oTix~-0vuTkGqHVW|gak;Qv|tKZMJK zZm7}jE6`zEfd{(a(PqYM9yqm&J2Gf5V>e3t@8mu{aCct^FTD~!b8~5ZX3VZFmX9rq zazD;kRQPe`qLm*HSv2&Xf?2-`OT5W)PeK3T+^^z2rf=d+xxCNj{Z8KR;=PdfyLo?< z_xE{U$@}BHtE_@lm30f(kGX!rg>O~X9bC6_E#@lZx|{1xuDiHy=DLULr(8cv1vigc zpm)8U(2)3|q(d9iG;{NwaC5ZfY397+7RFKD=ke~Y@Z(Vg{xiap zWL=!xOL`7>Y_LAjy2up1dRqZ?hg ziMdg4x^U6!szVok9@|8aKeltWP7vps19DLI8HM7L_)+wz7YB5hu6ptBX=iYJd((@{ z%-N|j77CuK!DF>|H!Qb+*9VwyrE@lS7&1_JW-Gio9A5zseA)XBer-{El1=sI6p3pz 
zNsB0ngjVfKz!~sS{6e%cAvf~uBxAnJsRd_lNM3EfPES>w871Xkbl*jjZ?y$x4>LF zvM^@Xxq`E~;9{XEGj*kD2)qKE%o3iF$u%T(MvU4jvWKpt6s982UroirdwDYqHX^&&QL~oI3r^# zvRUBOGE8g4Hu}73+&R(xR2cTrvWe}vhMe|;s-Y5n(p{%~Ytg4-`l+k)?UvM*Or_87 zOz0o(+pudecS}Q`F}=Dj8M?TdeG3hugSv;WOFoR;{|vl!YW5v) zGWd1VR?^A_*d8i3MmLX8s<}jquJ%5M|Ed<%T*^OQz8|35&rp3F$+0p43)P~uC(di~&L_h#hL z^5qYkNpFQ#Tc1_F8Ak2{6lXqjkC3~mx^+~Ta|W?#TGOz%Y@Xx1lKlBxlgV>4R}pz6 zri}@h?w=g)JX)t3j#P7|PDMZGrYxJw(2F@Y{3Op_8GzltJI}EIcb)WHV#NzM5(; z%4{XhJU|0ZC&bg8i+v|m`jA#)iTW9XoBxL(Upa6y@C0J z@FEXB)UDJVoJ$C=n+mTRZZEC>6dq@MbJ(n@@VnvmU0V+DuJIn^d9T1%yW@Gn?@DM} zF0j0HK-oZ<7gn0o80J@J@IT3y4g4qB)KTGi9`>JrySlG}XLFv@5LH?q#(ZYjIi>aI zPu;a8_v2xUE)5%e)9kRgo92W?-cQyKfMvX|Dg@fv3l z@}$xj7;ErQ##&&X%Q$E1yw#W{V_nWzX94>^;rDVV0E?_D;yf86U2iO>+MPN zx2LOY53w1%sg>mH6!jm!yVPeU(W z$aNr7<^~z5`v)*?@DHIS_Whhm%kT~v!+(i?z&CXMQPDmB5SrF$8JmP|8<6^-WxMu& z6EdKijOel}Gqhh$uxp=8$mBUwTgP+L#Ed9g;9J>5J8{1BZ+*%p3Qf<@*c;0wHiK7~ zkLdJ?ycN0>8WsD878W+k4Q+ZN;2CGor>%b28yT~PzmYlXeC!XS;T6NqnFQVHJX6=f zgl;y4(oL|B(2wv)J9Lww34B$WCBQ3%*B-hK`50nHlX)6_#Bj>+iRnMKV|D&TjTs z)<0Kwv74oTN4uH3NOjoFp8j3ueBl`99Cq}*7rU98zDwMVuC}uz=An~hei`WdwIQ~% zt+ZEcXFHkasj&WOP0abE|C5Zdx-DI7Xa8XC)?_|6)q5R#ebLv&ezqUn%bZ|JGen;Y zJ!Phf?M!6sk?Lv4>dTPT?5BnHg0?lWwTT=?#>NHH+}NXo>85-&CKQ*&rWKS+B9HoT z_2s$@nKXKOY5lM_htHb&=J~TOLne)$zH7@6-Y4@ODwqE7?vPm<-_4v=!DXOcDYNk1 z%teI(yH{^=Ck%U9-*a|tIe%(t{RLO-+7gs2HOP@iLgdPCcz==i*FL^r(O%@t2fTmC z`#(OuaM3Z|zvBIC-V2c{g~*vgUE4te{E!>+9ACc!7T14|g z+OZj*RB1~!^Uy>7x~o4s2!3l()$4(GIhPu5NzI2}$vUj?tI$5BhxTcmzfWt>!KFW$ z^k=me-_e(FMPG)~7v$}~qc2V9V67LMIkU!)&f3{llr7<*|yI$||#!=}3TTwoBP zBC8KE7NSFh>Lmi>U!a@40>2mCkh$oVUjXx+ytnXQb62o!VguMB`l|2(%f~|(4TbJTLw6TIcR_t|4)1wF zhXH-@Zr*>&dyuXlmt>ced703OqMIAaQT}dV=sLiKX1a`AD8@dxOx9{GoDFI zX(j*kAYbCY62?LJ&>`R=^CH$_T_Qh%vPf|Ld1ON$=0YNS1m~Y;ej;`vA-WV=s@lT@%c{h z_pT6peuVc&c@NHiUgZ5H-h*=LL*DSUx!`jyG>{8^ z=Yrq4&_pgYkc&*pMONh^r{JMFz8`w-8s}#4z1kSryoT`+n+rU;x*qs_2JaMJxsh>N z$u-{;mPULH9p?k_%-``|c)fFearVm<#S31}P0UP{XBpF=P7tJtLySWq`t3^i&dt#) 
zu3U+JTZn%9W1fGGep`rsyE5Ju_*OQt5dBvFhJGt_Qi5)~noH+N0sVHy5%4WUznzSJ zy9#)FU&wP#p`J?_+uIrIJFc+R-wt0}3}3qgyWgE}44-xT)Jdf4>wCI>D|G)l`*?)z zZv_rs;2<<#1so2+`>U{V$lUN|blk#dLt0^o%(xys>)UJ_4@Y)y+YlUY1*gqIZ-U$4 zdMIvp;Qr6r^SZ7hYNbe#;npU-3VuV>=JUaj)PBV^9E& zuL#WHR)FILfjL|SI9>pbuL#WHR$yZ+2+ZLsdcpApU2t6Ha0>!Bo=ZJb7}uXL&bLoJ zdk*&#=5V(&hZET*`jX&yYfm^1tqYD@gw6x=xPlNIzX2S70~?6U+qQ5Qy?izI0k-ti z!x6SdvHOXwDC=kKW43|QC#wBtuj$Y8u6DYgvgMfALoWW1KNQ~-?5(*!9ELxOKR&sa zH21G;Ia%a6_L1M8VRkUzlaeo%J-zJ5ByE%NTJaSJYW{6J#c1Oeau{%;uZ*~K*af*H_ zbNOk;_~tb9(1Wy9Y`acuykfKXu61{{KTgN~<-!Kp)z&HfbkN5S|7fgvh3ndI*0pP} z$7<>VIe$yw=YLP@c<*}FjKqOB9z~pwFV1(21%{P|6EDdeMsyqY*AJ{TeEE_&BCJMF zCyqS(^Ycn?#GX#yvdC|;ypvpPUO$_;$%1s!1Q)KHte+3XzwVd1o(EGpLoHrU*`?Rw z!wTLjoB!Mhm0mV`{hXfvJFDHa`A*s_<6dMq-qB`k7z2yw!^glvPY<+NvG1^cs*xDb zWwSSP&*$#sF7>Xt#$RtyH|;gulQuBza8g8M64oUPOvR`B)NKav)(HNFfyaHo z=f3Py>_>b&&9Og%_^a}to&OT&+}mWbeU(2<<+b74D}KHBfUlJ^8qBQ24o-L0)kTyw z6h(fNY+93&GHs|D`=zmu>+k)PFLuDI$wdeHr0gD|#vV6@xlYjM?esZ^K5sJkI{J*i z&%jNFlP|5K&w6^G&-3Pnj?-T{#>s|0DYlot{u4=2ROGji*{A=e^IHaPGWS(8t-S&?7W%a0`xNPI%4r-{}o(F3*dmXN9uIr zgDve^MS0T#?fP`1&8hZ`ua>=c7Gj`@-@5LreS@)PAmz%KM*R!rWPuOFt#F05S0Z9yJJ?wBWG~-8oMNnzbAozm->54%d$O1BZ{1|9 zS=r0?<2M^??(60Iz9M7IUA=t2oA0;sotVfE9U4W9ls@L_5@gY8^b*mV53}A?)@OOW zrO47>aixbPE-GgZp3AeO*?AY;q>yJxo6mbijJ9yPSQetEu-eG$V6m&O3TXpPxck)6p%`(JfR{W@_0`bze}wN+Q-}C=Dn# zv2tEB*39Nw%XKeTdxh-((1V z=|5sGxWaD?-B__VTd_ymL>v&)Db89uCHuYY?1lHA0p~^+F|A2(c(_}yPIRPQuNJrR zJmV-aw_{e-rkyWmKvmY3>uu>h_R*KmP4 zX^V-&L(DhjY9HZ;%S}PJOxAR`4CKA+I(YjC?6}k?I+D~ki}bH}p7)^Ewk|{Y9v_Gw z_CSRnX`;)*_h6fzp-u5Bq58yd^h+a;8#!k-`uBL0!K;Z|I;OHOcrtO-Ui14Kzi{6E*DtK$Engt>y_@7a z`zAccwEvxcjX%vmnlFVobi{r&^)Xc^aBh~IPx(L8lRlescIboHrdBZyGnV9XJ^*%! 
zl=2*JT6s<@af7Vq!1sxD!`71KBk@*w&rI<=jm+6rGYxe>i0NTi9yw%hwhbl4pp={bRX1)seaw| z+hRJju)j;)dY+@6GG!mLz|2OwiW1c}@i`jYmpG@;M$V21J(hg%`SC<`k@$W|{!(P; z0c6WHM&>!!4X!(oDL51BdW80^M>j7bO)(!2#gW0(=N>NW=gL36;E9Z{%n<~3KjRcx|SKnJ{e$-T{_Mb+nGOzr(gFwyAAoo zmsulY5!0F95Wv|ZUGsyJEyN}&m-3@K^XCTIbFgcElRBDD+@ias{Qhdpe%c>|6>;Wj z%1-nfw}@|>I1eP)w_sb}3X~NOyn=L9AbRbJGil~E@`ch&TL7lA*T>DdOlcO*WkSz2 zTLW>Epb6mY3ulaH?2mKE8K~G_9+G&i#0>T3j+HYFlUzx}aN@v~ZK7+uO&v|_!`sbz zX%K(T3E)r5AkLEN$2x%2HC8R&g^5-Csc;Q+it#_sX1uu_R$b+|n*8+b^Y z_tOqH?KG=z9uoR~YBYPPpkL4X_#acJe9Q8-ZGs!J@2J&`i&D9g{BuWrIvE=b=`A|sZJH*cZ2VY#@JZY zISvD|XeIv!`y;WoUZzjT(u`hMb?{{V5#QJwk*hn1MRoxEDgvHTc4)f5 z^CY~y(x5I<2hZdOiIjPQGDC4&5nDz26RdXtc|vhJlwU-t&mWSuPoi%bndr8cu#avr z{mcN@=}&8#tnm;pFGFpcgf5&9EW%@nS8U;(xL_@`IUL(~w!9NhEMI6O1s|+*wJqXV z_K$OJg2Y>|O84j6!kLG17UDW^(M(zdc{daHc{*vOq*bLMf9R*qBN^u>3pI!EN#jv$ z)Qn>#IIHvi=#jd7tz;aDt20_L@6dG=Mot$HcCP7P4oM<&{0w{psPPKQoi#*mVI@|UM{3{gyh-2pyJAns$FD@=_`)sno zd4%}b9l8-QOLY9cexzgX>GlL4a1`9@OI_XRE9-wt-0wbNz4Orz&U5tPe<|lUd1aix z1Q!}9KeTNf`ql728=^-zwvtEJkE$8u%B~=UuNDYx?}bSaTj% zBG+Nw-{;!NCGGhwX>++&a*3>bjQg!zO`MzD-9CJ=bE!yd6MQd%c6PIuUUa&6^e$Pu zZ-R#2rLVHzO}?)qM%@bRBytY&qZ#paP2hzM`feuAZuBLQl^ft)P2fj9=T5vDOdpC2 zh1R6J^{3mn?GESyLVx_{bDwJ8rpMkz*AclOaTXJiDecsE02|02{!1^ur_Dy3zvumB z`UGxbcgm+Nn@P*39Gk?OqfKt)q~4D}dD8yv;IFh_Z$CzwM?to}*&NqkaGmxX@W5yw1M>~4b3s=5fC z@f)5$>8rLX*8XHJ#Q5m4W+M5@Bz{O=_Vs2EW03EDJ~0SB5$9Uer5riOLF{j#u@~fQ z@?dNw@bayW^Ayml^!efw!yO(!eLQc89`3k`GSLTS_f*Cd@TaSt(L@{qvFS7;$Bxri z@pFn_h7UcsG{~YHJL#Xy+x|@1=xSrc&u8B*kiIW}mV4aj+<*o#Z9pbjb53{GZY7c|53CdK&iOeNVGPA}c5|2va*j^)y6u^`c* zZpd$wG8Hir^m`opLF7F#a8D21sVB5L?=)UH+TzQwI8`UPcHVjHHYNt{`rHD?E}q%4iJs z7X|NE3-4U`epiD&cpfBN?+W8~Ro+{6F#zfT#=*L8SA*-i)h;^6ZB0VuR`A;Y-1t5GPxkN5F8y26P5(-J>0j~x zr~b|TC;N9vm;PPTP5&(g=Trp??typ4ySxTJ{ps^7c&D86CHlDV&Y!_MPr?s{ckYIF zChEL%JiIg3)y#h~S1i+cXT0lgyuVN0eY3{bi48=`eqYLlcUA^?XJvqQx@Vc{_5}K8 zN>b<;XVRZ1^Htra<=??SUkLQ?`|_8E`7cy9)brkzf372~C;mAd`K0sDDCO(gzkjEE zSo%-)Z*G_V-Q7+9Zt10e#owWSJ>zHocgD}jf&P78{LJLPQ2d<8dsqA%C1v-3pH+eW 
znW7YW<9`l6BLe;VzWC|Y&cx3?l+zVIPm-r6{PYC*%RiyNgvMz!<#n%{Z3yUQ@6d+t zFW=)`FYuq}e)$32>`%Pw{kyhPH`@{DpV*;9H!BkU(l4=k5bH484TIPljQ`I-)gZcF zBYFLLUYu(y&tk9A?UgY-(`RpIPJM}=(J}m#PN1i@1?)VgAKINsA-tBj4trB*!~O+J z;kAjbwbU!H5c}y;cl?iPb8e`a7Ok zt4B^8Ko0bD7SkaW7I%X3dgAjZfUnF0B+jGoV$=0@XJQC^WnC?qe(hXvFYrxpji)^V zUs*G|5%{JVlRDtL1o-9z;A;ZD$vS*52EKz_iTo#Ltoq@b;JSeKB=RQCy|@m09PmwW z4UxWffp1p7Bz?_(w;`bezGi_lFy7Pw;{+YX@nLGrZ9RJmeK})?oih>J_g%lR(Z@JfgtS9%6Mmvn4sF`Uy5SDPAg}nxcI8z; zSrZ(y_xUGSFY9hS$|h;&M#iQ$8py)tm!d4{+$lV-b!M;K=qx=jna~vgTl5 z?kw$JNvuT1Cq`&i&Zo2zcmF>6E3ixfS1XC1r`z+Gw`cJ!`^c|0B{Q-j+YDNl)uf?AC+9H0&ah&raI^bxFI6v=0Mm*w~0&6X$xCH1Qb= z?Zb+1+8_IKFpc@V-f7cmG|?3;G>V;A_{2rT$sMRvoakV4 zNf(_@U@Wke|9*-s+(cRx?^fDn#XcJj?6Zp(w6)%w)dnw(`2@Q-XXxa~Khl1wTjIZp z|6p9Z#Mnx3?QJ*Ktf4JE(YMU+M}tE*3(tT@P41P}{b#KuVxv&l6B0`E-^o%qo zGM&zUI0^fLwk)Y`B>fRykk2~xZ#c_K@bpsB66fRuXmyY)hj*ciL50Lo0Uu?pTJjlK z=lp;+|28AnnZzEDh1gIkMGh)_EZL(f??t@Jdc!8*g}tF8b`t&?4I7{{!{T%1NLh`P zvxPGyA}@i~A0TgLm~%~mdRzW;x+Xg}5953w&QUDFUJ=E*)C0s6dXu%w)j3g|`ATd; z@L%Er{W=*xv(f$axZY>yD{GK^lIKsHjl9MDM(q{-g6BiX*%0A%a^@}ZGOA6iHOiWW ztaEru^Z(alV676kg#$P1)!}v7*kG(y_JhDrtn9(FQn#0VB0R4We{FLnvQt01sb`+h z-FlIU@e0~ygjb90Uq5dQJ{`|EzLdcj;D;ml^A<1@{w!yOC9?k94$qRg#?hUuC$zVZ zLoPieb|lj^=DOG%nb(~iFS$25^2v@)RB@&{2skcgIt>f zZ4#NWls3%^$c+88$w!;Ah@GOhslZ$p|8|(fE{$n7M0lsetBZi8hnPMB<8#Tkx*)u6 z4RBr!oLA%1EAiJ?Yvxt57il5iwpfTi$-XlD2K6$EbbdXR^^Pg5cTC1lHW%BG9Y5LE z^z{o_Tk5*5B)XS7U>DnouaxDb1aRBrYWW7cJ7@Cg>vyVhb|n2u=e-3wFhK)q=gx^O z#_|dK>AUa+@dbuQ)QGR)0el^0O-cM-{!TfPW`nQ%MV?7(xAP_)9JNwp3*zzM5L{%L4TCeAbR^;PrX*DWADz8M=t9LrXf(+f0mQ6|`tB%6dtj znRhwN$Vd}>&C2_CDLbLJvWbCwR@tM!d)cb@vVD}z8OgMp_Ui4nfA_LIj7^U)fp-MU z7Fd{oiOj{1M<}0rupXawujrX3;5`@|d2)E9BcA*A191+~5w6Wl(dYVd&f2ASsG8!Z zY)%<7foE}Gtdyzfta187FKybxns8TGN}Bl9ZNjHY{OYPh#m(P*39Pw+F6D!AwE~3w6^C0+Y;)zM{kFjc}&;uA?@g( z#gOsn4R^){>+Ocd!ILxKBlx_Bx+bwMB0hxK_z=!x4K0`Pe*LCgr}N(8wu|uF5&YSW z9?=`#($8&!Lh3dJ`qUKQFG`uJ#ZO7js{8$E_QA*)pqEPI&nGvtfaaB&LdgC5BOOPa^+FycctUK^s^`45=zH{Yx-C~ 
z$M_(R5kK83>YPxZ^mA!Lai?nO(zZ|>m;b~kP0A}Z;;RQ=3B^&X?0xEn<`wXC1=?pj z-M;-0aFce(Sob#0B1fL=tzQ>+(olD}>yb0oyJ=6;vG#4h2$ZWh=e1VaLtCmC?{NI7 z*wg;D$7pV555cO7*l(~Ozj*clHQ)m@PR4AyIWXS9PR6^6SmY6m{i$IQb&tM3#VN71 zE}@L$#_(q8qX|FK-ufx{JU^tL-Q%;I=@9PLvnQ}p6$k;S^Ue1VV@&;!vlDlAE2AG5Pl`)BDjy1;o~FxD>znFJ&du9 zF&-W013p6AKhx(L)6gp}L%+BbJ!2~R#uW6W$?S>DU5Fl{?=5OCRXGB~Z)*#B=+PF& z^SidCf0wrWa)+_zf4N@cYJWxccNmz(OagY|&jTE}u|C}<`tcM2&bhou<>TYo^DRU!b9uMel!r%9!&ppn6cL8_N zwM%#o>d);)O|Nq+?Nt0L^Rn-IPY?Xp^&TuInBLQSF#TV>n{FLEQ05yFBf$)kFq@^u_CO@%c_}Mm1w5s)$Ai$Q?49a6mtQt4eVRm zNbHdRmObVr?5{4suWkl>BN*poGjUGDPuDK{sRMh;p%p!LLb_6o`kH=Gias7*bcydY z9+6XK9f+Jdt7ANPZ{ghvY&ymx&OXZ)XPcGIWeC7uyN5krHnm7`PY;P9F5|HPUMlA% z_cSIGdm0n=daVHuZe={`xyq}ERecY!Q>kML_gwDJJW*QzEYHvI{1*4uxJ#TDiG3mQ zT_lFfj{~u+C06w<#HKz!5X)L(TVKHY!+}`V61(~l-meYBvX)ra*LTFS?i#!L7Gk>m zm@63Dx-)jQKbCc8Z0ps)uAWQAa6kJ+UY9YXZxXAfV+@Dd?wikLu^k|W@_|%|Ar~6k zT0{3x#IFvHtMIeqSF&F%z|&-2ug@*@il=R`q8;{?#^Lxz3kjU9H%Tf_cSOYVG6CYmHxbR$hF3JMvD4|Fm+=(5m#Z zt=g)y^1l<3znHwQe@p&9hUBjzZ+3joWwWodqb-ux+Q*;Q#q)I9)U}`3=sNQ6AdBcBitcl?Y`n;`AQ`^LDV!eub26Lu7F>WV~)E1jK*NDAF zYiHaL>8NJzdHIs!drHia)%jZ5zWwx5{Hwlze*~Y|gV6E3JgW0~Omye-r7@k)2ST2= z4(v={7Tfv!O~`X;Tu6Gz^M!*t)1MA`{yOCO2MHm0k~*JnO6hzq8r=DO=Q*9vAEb6Z zPfPE7&dTh3&KcVITr<4$dAhap`G(P*&+mpjAHJY7{rWMT&##W{e7@tN&gUf|&!r*H zJI8hA`EgF?^X(zeOGBPFUfh{y#3h~2M<;bYAF_8ozdfb%dDYa;=RucsKHnVjTs^Hb z{bb1VqUoLK>u0c^#PQKLpZQ+MyII*9tXkRy(g)^* zd6HlFhS(m3Mr;5Tq1)obpo-0G19}0OM z@)H#o7E;ea?4wd<8FgLKXso%CtB~twT!w>>{`vNO#P?%Nk19-_=f8_EeR(*+Psy)@Mt#Nbk;b zRNj`M>~R`}ZJ&Kk>&Rmv&5KWaDQBqdgui(QvxkayucC}nE!=A%trcJQl4Wsv`!;ZP zn|BEN(dd)@o%j95e!y`y@VOc|T?M?Z1a7mKYhA${Cd?6ei;8m&Qe!3|^EdR<@>T6?R~X86Ywwr5B3)=Ue8`# zCw!Z(MhtaN|o5~tqGrd~-N5<(G?fXRQ z%bx$szIxVpW1BL}`KqZ9TH_q~vMDOBT+_y~j&RhAUXv!VOSzXC2bE;~&i>U*p4(@# z2a>smfy>Ax>-_nw7Y`{_bx&6$w!OyM?`T8c=F!WO>b6!Sv~8tt7d~L~{bRMwx3%Ji zw)NoCMYB|$8M)Fjll87J0%(i#C-w6`wi<8H|Ji}hD|6($(0sCTyj zn;K9z=Dz5fU+t^J-sldG-$~+p^BnAn6-jN@WwGFK-@4B(Q$8#7CiuMn2sWz_ 
zeAeLcz0r=$n_SRv>j>iKL%R~E{usD^AaaT`e2dL0ap@wphzQn?u3pWVMAf{PfWr&4 z$cTBGsbt31%H%`vhIQH??~%#M_iJ$ODe&wdcq6nd{3G9R^;dSD-RvnvFF#rhyxi~z zc^<(tbY9Ym?PdpkiiI~!qmDQ&qWLT6<9#l{Me(!tVe|a0F{$lq#{LxhdF2c_H+ZZ# zS7kRit!aaaT@&kC(q^m?T5?ZjZzelc3b)TN`dB5QSikpZmJ*+-gd zYO4JMu=t10FAVUDDPJjrpJ$}c(LU;2Y}=wnmAgl3p_UJ_opQ z1NDsI3ghzgjWYN|Bz$6B?Lq1bgC9&XMtFU(HMKR=ml<8*Pcsk`AiAb@Lcb^B85OkS z|B?6R@lnrX8c#jDVkBq9KJqg zq*cHi4+fWV;MY%+pZx*+_15{J@EVUV-?-)c6yw(OYYU9h)6k9<;2Pf9(fB9ooPw7+ zui#vG*#d57Z!I|Q`}gkB?nt9ca3Ve+AGU|~5V-2&KMwafG29@QqwZo*UB1*=eDqBzVM$^Jrn;7dS-%e^A+Bz zk)gl`*XNvxje3W8%lRzkfjsz@ig4=!VANX#uXA`+3altQfPVpVM5F z6edwO?JflTIp9C{qn|p z>1P7{e5R_Her>#Bg?%p_dODJOBSK%O+RVKd`CkpVqPf6F(cu7oS}Ml)=@htU&(Sra z8`$%)yEG@!XJ9Xn_tbsPz6)R5kI*mm!#_VOs_wPt$6J--NMw%{|NN+yljM-P581nq z?^kdP${~@Q2kA&q2Du*@B%?CKl>8tWWFRug;rV^cZkO9y`@&r7C1^zA{19`q=9%?A z`}`Pd8TfV>+2bI!4vs=c*C2Ofys!B0MgMj-kNq0?-%CrMwEw67tf3ADcKK4tEt6b^ zEssd%`EQdoSj(WSu@)G}cYRsj*Z@7;Fn!^Zq0_H@Qu4+F^qW^e{Sxyp}ao zO^_CN(A&)Mn_R1wM@yc3iqzpAI@aa^EzpDt``T?y#-DuQ4~5ng%TIr5!T$)s3~*G7 z++qO>sr63bhVZ_7k{!` z(qlYc@sTq4jbby&uj{BDf-YKYT$@dt%Jio7-*xxjQMG5aF`}7oOO|bXN$}JfGuj}x?&Wzb!HHk4_(7b^&#g!hjmU*h>ywPCpDkIH0<|;TxN&GGvsWF~I z4-Jm7uRX@3+B1w%F@FYQETblkh0Yo@0y_{hcg5m~png zGRB_obIiAX6VGd<@A};V4LW&|I<53uGQqX1uVnijxkl+F`QgPL@&o6R9|m!hoKS&m zuwr4P*|2G(B{@MdLJ~5`L`P&}S3t&>?z$Bjx`uVQ5&kYYzmxI!W&2dtP~%IeUCbK3 z$ofgXmt2oLZ`VqH+Lh#0JiBE3piHltAd4B>*4(-F*d){2zae|nBFjsrC}3<`7@Oqh z4A!{dhlZ_lm;BTZ^N}3I>jn|ap*=Ar|Fkj>bAbD6E91=%f&Cn3bYl)Np%yTh!(Jgd zj_Ag5^MP&Tp_|Sve9i;x*D=?#!36{Vo5Rt|dzC>AZ031Xrm?EOE3Gl7yYN6HurV|u z(mRkCSj!RLXaO6WfQK{SOB}c%S+WV3_0r!K`2A*{Ke=g)g?_fG{gu(yFs}Cj8|Az^ zlh)4p8gS~+SY_*7yDsU;rgZ3pf%c<-Vav*$|>&!3ySEswP^={+-ylzn0&k%{qBRuHq(0e&r-QEvaf-DCWsBAUH~g>$@b2k}dJt~6g0b@^iH z-(5y8?<6C}d!OO*e%o+(E&5~E%4WZ_Bgpy3knzue`=0|h)N%N?97me>ByoL$wE8$wQT2K1B5&Cqm zO@I5Kqa~O%&=>KWPH6Buj7>Px30#N<6C30`PfY9<=2P~TGtm9%Hb3nPEl%`G|4BRJ zqGpl!C3Y6ZAbX2=&JM-~ty(1-E#0Ks*~ixeJjxccm;b8fis*DQzsdGeNsPb7)>-H0 z`J&VIZ;Vs2g6MQ6aBwSQmpoE2!%v%YIR^ER#_3;KKj`yd_^IggRnX^s(C6U#iFU=_ 
z6ln_Q{`xxhseQo6pL_G%j3>W-9sM}?b?YcN6yLGe5PqsPR4iOIW9+N-3&3J-`DIva zoO&4+D>!cf2EPyN%_MG4FxUtTR!Ej&omK#YmC&el*BhlVteNz$Sa?i5`{_holVYkq z61Bx__z`Uu{2T&))=us&^^Z5IR9QP^%enBg^Bz@u{@M8hbec*+k!zV5P%gwgEu(x-+vyUw^J#^Z&dL(k;X7bZ) z{hBeU3ohC66zjb1ydPF0OZ~9g zIMENQt=KCx4(YX8)nWzZJIlYIvK8Ad4dG}JrH8A_~P zl5ffczdUe)_1nNay+S|oD#?4O|^`<539}tFXNK-4RCpdohmsgHGqFNlTG`>5usMad_WqWXJ~kD8Bu5eXNGYY@(0Bz8yE5 z-k&lLN9jwjfAJBC@S9j@V*;=gTbmzF-RIc9-M{`C(^8)=lO%CUI zc*m4W^LyPk*+-FQ?fF&z)%Ru0vE6_A`7V1`X^uD9bIks9M!bKXkuiv!Nb0GlG!HdK z*!G(|KP*;ojAoq$htCj4BfN=-%J0f8B6eqi%e)ggZE)2AokIq>S&fcqRrY@_(g3Vf0FZ!UfVc3Z%|z0TAU{AgRp@!mnetz_Yo zLyfNQ^UiNCHC9~(d=HM#f3EEYH?@Q#ye-lbQzN{JHEwbF^}+L73bcRuD(Qo*r}Hi0 z9D0&hvQ|StFI;=QAIHvbaagUV4NLmG<*jaNKhJYHk2WX$>C)^zBr2Y*cKYh$h89vXiOe@8iP zs=t|<=%YPG{g`M+eKB>>skv_BZwoXwkGS7O%$XaTY)jo3>+9A>`H|AM!f1cPgE+zH0Ox@##1J@PrkFhcGPY8q|ZO%G!}ds zW-Qo@E$$roaU;V-(PklXz-_-e+s z5tvAa#!A=u*8ogPem_P357WOU`g?@DNy*9i$jP?vW$xAU^}fP@tSlH0${Bflb1{7i zcSb6YjXA%!bD!3X@2o+l5st||ZQBl+-*rztVXb8?UgDc4`Q~xv>H>PmMd<4neD@Tz zwkiST zCO3k%i8hY*B5gKp7;PCXh1Tr~CEg^4xQSTeC*p{s=tVpQu_p84iTMs$b(3OER6qWo zsS7;MD7~f>zYlSL?##U@&um?JM{D}p@1{E$Z6z4c31a;K3&UP~l&4PRx@j#B-k zQTjXN_USRgRpfQ9_aMK|v~x-#eAQ3zOcHt(b}+@FWOAKLY=k`~uB*StH9SZ*(gK~6 zBbm`n13j7YuO5f_>6jID_M`O1{`=Z&p-j@XJ;*7GC$PwmiykD$@* z*!n&i$Ps?~2=#c|b2*Y{(++)pWB|u>qiZPhu&)<11Rm^TOclM^zmppNk<6j==?{q~ zuEl@+xF^E zVM{prqtnya5{V3r+(}MNSF_7q(p+L}Jv-5DiO;B>sP>XN!rE4ya;>iZ=t)bwu=#n# zQ1-3(CA9GHf55XQP`@;GPgN?i#Pri{Gco#iRli4;7`wht>fV*)S<{#1y3v;QoEe)H zVYXriZtjP?>u`G?gP&w0htxo4^)1DJY-Nnvt44fId>C8P>@H|Z9q~~u>x-(W+c28Tea5qZge`((j`cw^FYke)>nn{jX5_74&7H!?a;omo}j{cl*C6=7z`yUQZP8~8Q zI<=l}pJW`$!?x&qHhoau^>TQ!L2P^(`;BS;&M7_i8>?|)<7Ph2P8@5E%}O+z`z4?| z;b-Dm>EF#A;Ugq#NT-*7RrC3E>)1q%?kk}?IZ6I_tHqOuw+8mdW*gVSzh-B0y@xrN zVdtNDe3|zerF)sjnW6~}Xuv&2>FxkcI7FOHCfAL?%}i(l_F`Z4-Q>8203a4JbeSO^TF)i<_dfWx^9ElJYLq@+)OQzmz`nW9~xoa?N{c{Uo3ezyD4q$ zmBxJ8c*@{s2Z06YN*`?Uq^{uIs~7Y#e}XM(HM+@a^0QX+{AOnejT$2Gy`MOHH#Rug zzZhO8`m+K%s{gxvz2Kg`|8F^RL(e|zz5P?7%ne40ck_WKtskJnK2{dRHV!!-g 
zc9h+B`AIehc;q5<#wF;CFD0^XV;XzMC5#%MnQ-&?!i3S|%M-?oUmKEFzb=ZscbF^T zpm@+{D?8~&T>1w0&;{fej1Ai&w zdS`(4O@^*D4TW9>Xy0O;2WVfW&Y^vJX2(^W3oiJ78RtD|-}WutrHW5&%!MvpGs@by zWu*1sbw$>!n?_mv^NXxu#wcqbw6R+@arEHhLyX}K?0vC_^U27OTI1Ty0s6)Zf1}ik53_K7Wa`a^KbCg`vAi!4%e&MFjjS}B6%V|= zB;^qEv-nDU;LO=#;%(Ppi+pCIe~wBx_G6BAF-Oaw^@@84ht`)dN28(XFQ;E`opB|U zJji)9bU)K@tg22OVTsm=#>;-j`+V#+Rcb~xR!xJZANz5n`KPG8Rp;j$=5M&(L`=oG z`B7#ybM+Q9zH5H0xosh_p3tpb$iBNxPwH;I`yRA^H*+F;)p6*gY(cxRckO1csom%o z$BeMXLx!V6wY8)xhB=mARR(&6zXufm4@__&ORPPwtJK%Wb-eCu{^W(4sj#qvhcEIu20b};(FRVW>wzB>=x8>J=a!)~h#=(s7AqTG< zUv@Ba{LKgZk6(AN-}oOM95DXVgIVL>J2-H>_GHr@Zj0EH?In)dzilz+?{d`MZA&;m z$}zZa+gfaR>tdE{+-*2l9bW`}gqKy*Kh?RUCcG~n8L5Ub3swf1epvBU`C(;84_L9~ zf`B}53$P(sK=9JWd$NF;Cc(?f5mrxFS^G0TtUL&8>;hI60xQ4o0W0z!eKOy130{8p z6?oaT(vw=wT0aQ9l+TZ5j-9|q3N0R=aX76Ht*>{N;c5g2I+g_B#Ra_dT*HIBBMf-i z1P%tDAI&f^lzTl#6@PJo;1xL!ZoWvb7Dn*qT4dSLYb6r8NR z;->oQMAkmVs9&GvsNanJRxnTw49IsS{0PFpBJ>^Ek^ivCwL0_X?A6HeRoW=pJ+y~t zi)gE98~Htl<6w@Tyy04%$FZAZ3+>08f1h?gZ9MHT?S0ytv~Zs3E>a#zBsy;tx^FZ( za16R|EP8z$wQfRIWue#4lst@HzZ@CXuhYMu@^shAJ6dd=-ZQg$hYYTr|P_b^C8XOT7WyMBmFphl?xi=BT#_$Mq1?hWzv1?Z>=$ksr5gmzL=k|T* z3Lm0Byy}RFJcO>W4_!e#RJ#15?-p58`JQy~m4)mj9MHp84kIrIU49Gi`+(Ng5NfS) z8nZi!!mT6d`t{_?WIrqYEB_Zir9Kq3Z=P#TGGyy*Y)0ol9M66>>{)#{e9^b2qt`d1 z?`yB*X7qH$htvRr*|z>1X--GxDMKd=)>vAEPG|dM8*a9`S>x)JbFGe|F?<)D6`8j= z=8sj>jC?-(P+!v%{oATB}@!mLFC%=aKw%(82beMM^Dl|Hd z5Yr)lb1nGlD?%U6*6`!kjH6t_a>13Z4>=GxCeJyMj8O)dHiEQ=3yzpOue-V9* zYbPr&=tkfU_*~AKy88!oRqjm(-g9i3ix{UH*n5Tb3d&uJziE^{0K7}~YCzs{a4nb* z>Yj^7l7kBD@&1kiodfSWZ@Qjy;9cj7bq>7iJd^XD^4ePV=Deh<^4ld{bp$zC0ll@F z^XRwob0mY+qOU#zy!n7PVs37q6}>N|k^9Q+dYF3+j@XKIZ>&tIbT}*C|IE*Kyd32# zUPL^}LvJDr0+U0xBMWlOVExKyh0t;ra*HFxIX(ow^O-BOWB^C;9lk@{veTFUDSPup zzhCtV_L|O(5oTiS-l_wfpCOmwiG^|Iar~5Xu&V^+lHVTjq^j0^pUPM>xiZQeN!+Sp z?nV-$mF#lR!fEl|>s`r>=`Q3Fe4>gePj>Wa+&sT`Dm7C}h7aM$H_rZZZ*v5(h99Hr zN#`RK)%*Odz0GOPaO9jM??Cw05%g1M_K~zD1C05K2V0EXv7hySi+3esKe>KLa;oZR zcK&l8Gnsz;44LSE-P(tKg?oPo|2R)hL%-O{`YV%mZtRy(I6f<(X#ADvjQ!CWow0fK 
zAxTR%%9kiO?Bc)a4s>(kF6D3VP#>@byIdFb*Um4Tav=53$>wR`Uijo@eP&E{n>paq zAgvYiJsA9a+Tm$z!_N2g43DYWcu@u;|y|fM8v^ta1?p=Zp2fK>ks}g5KmgC8Q2^`16pw>UScDMT1PV1Dx?@N&Fv~KPp zDdyPBWb?|*M0+1c7jW&@FH>v(=)*qRV@>2MY-XLbCOcS@eAc8VO{#`3_oPXgT=%3& z)xEz=lQOvu(xj#Y&I2@QvCacDDO2YGnzSSS%j;pF*9U1*6Tby%QZ?s6npDR5KSGlR z?Fh)i+b_|iU;J&F6!KST(%N7CpQTAoWZ(ZPO;UXeXLdi^?-ry@1<)qdZ2d=Q(^qw! zAEWauL+5#eqw0e6gLe+V&YT7R9Eh#?N^H+pe|+`${@9;)2lSbp=rhLx`phoQ!)9j2 zNC$eC^8p_Zv;Vh&>tAvGCE5pk25Fya6{$w-)ySYJv=G`4kV*d+t)8}n)=K-BwiG$` zVOmh8eUN*%(Q>#)K>zC7X*bY5q-~*9)2`w7f1-ucx_3E|aYK;>!;lHXkqzC*h#q9z z2y&&Js|MVLJb?VxoM8-C?V^1Fns&&64+`3|2b!kZ50YD}p(}gg=kiI3hGla7J~S=p zld4WLN*~4^T!HORa^`&G%oo?EuX+RY{DJHNhnrY`H=geZ+XORDIan_cDTF<I+p#kby^*~(Z#2wyhs#^Z{5!GF?4<7PzC{JrueTLh zJ6KoE<7B?M$1qAi#^0~~JY%kp zHS0%WdxS45&;3ZW;m>J@XH|3F7yjh^d45g>Jmi_m2z&<-J?yFZ&u_sV?sLkf8c{Lx zjc4Ff5f#PYk8+smH5<9r`1Z|E%B{fhvkM>NyhKwIw)62md^QS z$rgT|)d=s2T99hqx**N0pdO&&fZN|;ZGp$V$Qvi&(|f5MD|zE8{FD`rXljeZczrg{ ziZzc7#b=foX|^Jl^iw@w>{xbPLEhu$zajiL6FX^Ez+QR=yK0vmqlFEWScD722w&OC z9%1&m?xhf$p#LP)9>Yc(ltb*ePUHdcylQl%yP+dLqE^eRt=+??@Xa0cr-F5?yw`Z6 zZrBR);*9;oQSL{s-@kl_+icGCm`?nHet)~ZlU6&wMc>tV65s4?e^2qh--doXN?S%7 zMZ1cYOzU=O-}x}`AsoDLgC8F71bQ+r5`VaJ)lg_kGrGdqD~#c8a3feBz>iZU{I{5A z#_{Y}-Z2K88jZi%W$UQfz*0J}u{e&L=RW@P0?w~SR%+f*U}du&E%4uH@cZabimVI$ zndh`3EB>xg)&T0tUj==-_EeE|J?pRcsNO>ZI!p;R2Js{1N$kbOUxQD#d7#tUE0}P_ zG&Zxx=$=30+XGI{#D`i<C3<(x?7J90>Gvt&rT zc}ccGhI{$}Kb&L?2{(P{50OJs`A?|XZ%BygmvNGjarz?TB-k>JVB&LpSCVti;)6X6 z44fmL?}8mS7fT*TT&=|#og$vN9e6m-@5flH3-*6mx2LE%d|;II8EdCKdXFNfVuQEk z)?mJd>RAe2*0Qdek4A9l5Vj*PN9mmJGbid>8~sIJdhVnvrX-f}U5(yiphv{L6k*0; zua4uq*TTN0V_~T2To`ZeT41C;YQ&ZNdO%#Ni{E}c+GWP&C!3Gr4__7HGJB0mHCK_Z z(d2fSMo4e-FgEEX#+HE%%c1y{>#;qLHnyoQC-lj-WdX0$5R3L@vSyFb-z;D*nsyG} z*6otbCIYyL1b(7`qiEnM2H3-Hv&*$gaN>r3E>AXwUxGu{7}`%hl;(BAF4p-(9y}9OCId$hw|gX1o}RnG2F^nZedI%;6gFD;J4QV{ZwUF zKr_LGO5iz@^S(dJpD~d2x^qFed49hi2O=IUveK>^W!2$N&4o7BE$!2p3vH~!o|6kM zD0fVHLME|M4bHy4AgPs(Ncq+Ug~mrqJD|8c;Qa?|}jsRYwI 
zB+VQ?gm@3+_e;JgSHKtbU3^hX@3nnVTOA&+yCBs(13gq+!+G?#3-mjm*qjUwJL%=>vd@hPs}zN4||cS+Qo1eeZ)7c#a2>kIfnB=}(p zKQuq!h33YGEi#A~^5H}?jxCHMhPhb9Tnzn>aPt)OAoiDu_;Y?$W#G4Q#JpRTxjn=@ z!dy()My@mS;9Tf5jcb$43?mKPNr3P6F%L8Lj0nR#<%(;(8eiSc@5Y&m%ZhV4ZTuM) zmTKNU#%bfv_UHrbg-!kweu^E$;$_S%+txILIB_@lGc$9WVx`g@F}CkUW7Rl)_=2Y~ zp69@ucW4`E->2PA`&)R_6L)Od`~PA5Dd>Sej<4WP7x;4?9uhJ2zZZYng+E1?@rT4B zOS!)Pz4(*-H}U5!8-G&3pB{c(@TVVs+7Oy-A3f!gWAKzv@MoSsAJC6KMSqDu?ck4O z6X6d?{eJ;}lK&3;Nim20E&NIT+xU|UzU0voXkWbUT7721FX1=TR3zRz7g=XIvd*5^ zFUvaANhG$e0Q$bu-UDEIKz5lCkX^)sBhh7Nfmeq(uY*o0Mk@!}JrjCa@lW`V;ltv^ zYrbW4xUunx=Cl)^e2O|;9mHE4CSJ*K<1>VR4kM;Sc017l<^L8tBE5&;p^JI1a&SK) zChbR#*v6IAJXbDY9Cq*9X@_~A>U15UcGaiIhEok^<79lDQ|Z5KFN$M6LVnh#$RJ0F zKg|yzJ}ZitkUwV3%KT#{wKMxsLo;jE6kaUvm*oC43lC%&A>I($ zqEY_-PQh=5K4SM*d4F-_T@G_bLd!kGY(|pv=a1*q{wTN6$6h`9>83yaXCisVpQjmo zW)XJy;5g$>5VLFdr-g4$VT@D1W{mlav9QM& zHLv^rr@_N^zdZH-Iy`h%&&s3~(#mOnH$2ea|55NjfBq^wd}WMZ2M?2hhwi1axr89k zg(BAx&vSP;bb$Rc=3#d!a<0mTW_X|(Psb4R${yx3bc7BlenoZ_>BZC@?vhL?*)fyr zNMzE%$d%HetEmYe1sxfJY`I2qGS|`QS@qa18ZP+)$MMave0vOi7)@VpMz$Pf(KlPq z8iJnHApMBigwe>BIxpjVCU)Us?B;66`!X?dhWWWs_Wq-Z{h`okhpWb0zu?6E`H?IeTKNnN!}^B$l%= zJ=0&qS^d#|mlx36FT&GQzi90b^DiEbln*fKJDcd+;Z)<#havhhXV)3PgM9o_C zlyPw$`PJ3*?cyi-Rx>iG`(pQ-?;scFSIpbCf&WgKdBZl1rQ6xtCwt^?8AAr^K8|(` z%}x7{*S^drRF7~EYo5-0W?;i{5BM^_$Rv)aC%@>aH|O(>J*>BJ)ELV19oUyj_gV!O zYt)f#6`ZdlKW968(WetNH9nrwy+t`)-~ zUS>Fq1wF^~503$!G8MV)?-~O%y^tExnb}i&jG^p*e+<-64vc|)O)v9t^&vJc|GBv7 z^PamMKc7J?s1Xv|m~R+f&9CfYnvY?`a2kO*S%k04Y0tOvHhNLg$RDGs9EEb4;?brs zuc7=$^}&vlW1$>W%{}Wt?p-7?CXwC-gZfkaPiuCXwNqZHD2Rn(i;L?_HU;%>Lh%Y4`KY<@@@tp1;z5-oBr>bE9$tm)Xzr zfBScGVffGe+$*==mz7pzzvrvJ>-$%xDaWwfexCnb>YvWlFa4)q+B*CBul}yzveMRT zZ1(g0(xzVezRvy6P7VAvX9+mVJU?=_o4u35n48$zsl-+V;aM@aqCs`ws=bC$tl>lR z{cDJfVz~dt8dCS|A6vt+ziSPbG1sH1C;lFF#8prHHjcBo{zh2xcT|sj4%e%>{t4Hr zKmJp$w{YFb^#-oDaxIw8%zl*iAZ;40oHmCxoAzDW{j~4V9-}=@dw@2V_5|(8v^m5e zRAUQx=UtT(l(=F#I|$r#z3xOM;W+&_LZCc 
zb;{BMwO6n&D}PY^&4b=57uPqf&{C{HGd2v_`_5tmY#vxZjVL!ZcgD#$US`d{!(_mSj%Dvn0=japdK$;|Bz=D(HsxBr8zq5o+9hpeLiDP}-9 zrdKfM!CK})nIPN!<+)UCmtZ|*a*6zPH=tdD{SPH4GS&vK<%OpPBtu`pm(!A#cTz z*WAwYl8|fT<=ORu23!Fxh=(R5KpT2P zPZIG}hOX)dJsAuANS{H?d*aach8w@m*&4Iz8S1^~&Yk{bG5OK?)O#1LX&z?`S3KT0 zS_F9?W2-%;dmJ{~DBeMPD0)wox87}5jO8A3^?kI5qyJDqw= ze5Nwge2ChC3D{lJk()lf!RUGk8}Mf%jIJZl#@2YFtL<*1YbSnE;Yu`3@?aA_pa6#1f!`;r%>Gb_*fbL0}$70^DU!Y%n5{PMn0D_uG$>uXKruS%u{UAHsLwn-}2 z`XDf|89iV1N}9lP$&ts2M^eqcZbzt1%gT{=Uc+9heB5P5nzzzOwR64Y6KK1e@gpy_ z-6z^d&1~$Oc1}C7C9)kTw_Ub`21j4p*CCv25-t);WB-mXN4R(d+$}(NP#smlO3n4g zaK*8Wr40fu#%;joH^SIF7&sY6^8!B)z2Pwj>mG3OZQ8TI#l|=Knk%m39&qv<+K;0D zRQ16dX=dASWAk6eX5IxX-w8~Y0^1XT@jKvix8qCp=UEO0eu(4j*m129 zjsPqPwl2d`FJxM0tsmCHd%#lM*T9my_MI#9pS)q9@uc*|%dlj81(srgCFm78iC{@^ z5J!xgV5y-8EY%Z#bR0Rn9+*s? z4Rc1I*QEItL%&2L1y8LuJn=j5gkRc*r%d4KDd?EScpbL3I3uxfDEh%Tn({uoz^iv$ ziJceIjjmrV?cM3eHzVOhoxd(aS3|zl?ua19kNOQy;p+!Zr=0Zr=GsR%;4jhmmU&M& zIE+r(8tJxr0lzItqiy-19sR!D5$gSvKDQRZv+(P;78&rPP;cv)!1WlT>mvLs5Bv}g zWB?!PuYA_E+2jM|8C~tzEIvbaR&9zIZjWg#P3-jWofFp-+w~(pn>Nnc01l2MXT61a zQ$G*G+qB*%CK_Gx=RDn?+)DC&TllVOK7TgNW3|vH)uX6={z;oxZaDq~?+Nk#vi)%@ zhZd#5BZ`q!9YWQj+|IM2S0$?;DvKV4c2 zENLts438&XG~R5do{DOVXn%O=ZGs!+``b0#sXf1)|Bs-q%nObr?>T2IHF5p6Q_)D( zl(~pL*9o0G2R&Uxt&=6xIx)iY>YcGmHl9P?E)%Wfx4Zt_{bpYysj(9~!`=9`eoHgP zFZ%KEY1I=v( zrVatC+yCGn-|-t9$c!M}jb=KKg(u@8Aeie#hezLMW$bZE!ip7O?bj8 zB$wfJY8D=+PQ@Ij3tM=cSI=x)dS_=IzP)^4Lw&5v{hsyx>|W-k_eNL|D@JJ2f`oLvW=&6vMlc!H=YPsmR?h+pRgTGN31 zCwB}moKos zUiW(8|A4o~z;7+!c@Hw%@@u%?JHP9j@K;at@2j$_Q_SXX7@Pl6kGzk5Ork&c(yx2y z-`()nyYQule<`j+d4*&9px+^@WpZx4bDdScX@qreHM&K)0c<6Aj_r?r0gt=!gCc7L zJhK|u$wuy+ha4u^^ZY=A8ZW&&RZ~>{b!T#I`_iV6Ro?ZwWgx>*^>^^OG# z59Aznv0aaUkMlb0e0qoYdKeXhRr2jG%F&78p|#YTR{vk!@`M$MKYcQEtPy{Dx5maX*#BJ44{^^&?w7_<$FYIy z3XZaA=5l0@zUOinM=?!uiZ2P8rQ>9yt1%$Ayvp%b;^pg@mq&=>+Rwfn=>^m;iifY= zXOI)yr*jE9zho8F`tkeHJk%gu^lwe@Jn_N~Xz3B?v0n!6RqMCOh1l|o_=&%cRRXvd z4&J+K*J2-OQVo1;b)waZpBL_lzsSyA=_H4edGp)x`eMiHgB>r?u7f67cLVdYjM`$7 
zTd39MTMT`XEVJHqC$&Re__)%%>lsf2G4s;niLF~Dc_kD2abhxXOnsLEoudhVUa<9Z-j9dd>l}4n`U|wwbk-ByM_s4mC zyP~Y7m7$iGnpb`Jc?V&k*FU?y=o}GQ4XV0;&f>Vu6-hgY9NPEmc;ZpN1(*x?&n z8m_l;sdKQ=_dV-p_V^@AMS5SNZ}qO^&JOAnijN7-CgH0YM*gbmrk&%M4%{AQ9aJj_ zLE4N$t{4JL`)m^%u|<)h?IJgxzq}ZX*@mmP$?17@HqGQkh_u!zYW7 zQ&)1FKog7~88-2JIrQ-}Y`E?CY(7PHT8lq+9sby(>;-!4pZ>BY$YFAwj*#nAj=ozB zuIU;4*PD+3-&)gH{Ll|z<0UuQ`zz#O>8fRlJp*T_;QM9HY@C%Z>YX~*>Z7!MwDq)S zY28Z|a~1_piH5htz++*UWdsM~q&HlClr zd&cv=Tj5c+;FI&md#Og4JHVT&mCJ@6Xxlz@UMd>SHv+#m5g(lmPizKn)9Jr-K=Ig0 z&Kp@HcgUix;-$^$)SiLn%T^f8{j1=)(AthV;6eBl%YI6=1D~|6hWBQ`1N*`IiZ&Ek zqd4{_7esh98h_vz^uWFx6F9!kzD~;>ag8@&JGzeRM^DUu?gQ%3{SJ9F2YzXC{>2=J zbu8X#+4M9Oy#P8OeWh*07+a4x>MgWZI6}M^h^cM~_uI{mT^MEUMvwm!y1+b_)2rHY zFLG>#o;1*Z8#Y;}0TeQOunve(A&X~XL@QeZ5uz#(przx&cJV7+tjgFQ>4CER| zjIRt>{R1$k`guLyc?4hJU%qqU-*~6%(mRziE*^gZ8usjU@#fE%m)9H~?{G(|H;OtX zZt9feP+v}XE*`&`+NCS#SFmrBT`|73%=N9{b3u8ic>-EKg!-us4!;cQ;N6lzPcXO2 zrIP-V0gm%hgJZqvkXmDn1$1A7lU9E~HuLkIMI1nn@qGXwmC-`O(gzX$sb zP5rP(zcYc8%l%Hf)bC9ANHntKEy$89kR>aIMVl4K(24~pAjjgy%|+IrXlkM?=xd61iB?<% zzUG3%55v0@_v7GR(67G(T9NvTA}g9_UM0r88`x8RMI6t>@XSZD@i=3Bi?QPjh2~t1 ztlNw4M}4E0xq|O@JFvB3FTDEoB5Nr0lA!`4eRRQ{xERUFjrmm4(3hc_}?gsC3XfGgVR`AS)*IZUN`6yGN0g_YZ<|LX` z0~1Ym+*N6Fs1v0+;EFrwM9;ebKU@U7%Qx;Juc=nBUyzx0=fAU;cQ(FZTOnmP+ho^C@!)@=aZRFf` zUU6lLBfg6FcH$d3&m3LE7jpqRG9`S+jjAP~`V@;fZ*sZnjn1{R4i8*p$81C}SBHy; z*%-L}Tj88HW%*+^4reX;mhKt&gdKtVj*x^|O_%QX{mz9yx>kSsR8hSnr0=XFNjyic zSN_F}Sxu?izm-WWTmHvu$2&hDZ<=_v{P4v$CR59CF+B2Od4Y9-xt4Ej&$;e5Q_d|s zuy*G6FRkMYGT}}HLt-B};y3q@| zaRqcE9vLA4S)n&FLn5+65;8|JzSXc*LnL!RC$e*m;TND~TfiUrJETvR4MJ}O*QYbT z7BZrE|Bf8)f$P)Zoh``7x|hklZQ#~)kXf%4Sq}j_&Eyf({&OSIT^^_AUIeh*@{wWHw_j^@oOW2Jh8kV6%XKx3vSsCt z(*@Q@{1k1k6k><$uMw<2kH*%mO#I)r@U)B~SZ(Dad=yx`1uf#_f2foODIucpvMVji2&7r9t?P~nT zTBkL@RQ=M@&S%giYoLGK*e!R0$Fh?u=4Au=&WUp3(ddf@`9k+vp?ksmnI86I@3nJd zf8rHhR-6Ly3IjRb#&s{^7ZmSs2iJ4Bek>60@Hp2$<@%FAyu;79HnPjuM|~Qt=YHy6 z-EaN!e(L`H)i3Y2UW~j}%^a(5-Od!B;;>qr5slA8MtCEjl?BAp2KQ3W0ah0yvl)ST 
zw9v;z`xSem@n`WY{KbmM#b0<;Dzyw%6Va`>+s4VrN&S!&WoIDAy73|O>k8m0SF{z_ zaUvfUUZ0URm3j+>ZsoW7l^*8rH2O<`C*&nZ*wSm3uceAhkH0JOWbEg=R*&LtadSYNsS986Y z>wo21F(g~K{w>#i!jr$Vo9o|m?T;Z*{D|U76kDPg62*^vm!{Yf#gHg|MDZkwEm7>r z_i0bk=BBM+&dZRaeStY&+uB|F4r7xoMQ{?F-yobiz}M{Lj5OtSe~f>PALF5u3D9T!)kUGJ za-h%I(C6iYjp0`XX!9XxZd0yNdM&hBwPA!;Wn2$}E@vY@wL()Si&jIw2SJym-?c*b zbT5;8El+u@o;szUKHtfAOZomp`f&&SxgGj^8}xYs^m+Wl^w*}(&Cr{m?&yHdH&5jH zPCuQlAIPjU6@ z5Asnnv~aIu>c1Rv7#-Hc5mxI^>?QFAG&HgE1o1iTcN<-YfRB23)aS&DufwM#(1*1|U&!f|(IT2DNeZYDU4gkCleEVd-)xuqXD;=PGk9%`#4Qrq*c&Tsqy9-ou@ z7xYxWXWFqmi8f9B3^;w8|4yQolWK=5XH)bv9UfZ`ET2TyX#^+EzcSYL-6_U|y?Ws( zvZYcx0y=v>urHA>tI(Q+O=XQ>hxoQ-;7*Y4{#D#zKF%>0UCiA!j_+_h?Hgyk$rzL$ zcrMqCpNPF3;J=a&F0u~iN{E-?ImN?hFW9jwRZ|@ujj?IpsMEh^uaW?+{pEOj*Dhqd zvJt8N-&e<60UTM38=7kGTXGKjnT>yp`{LdHeP)AWm&|o8_wqg_!M$TIg7d1CE;toF zG%-g7%!TBEKVW|l%+xT}B>28yX0Ic)u?4x|>Ce9ZTne;Vup~IT_~|&t&R!4lx6&WY z?>YGJd6(8exd-rJ>b?jLTLf>DiAe_D+9Tn?g0EM?i7_$~&Ql{$`n=ZeT)M^+fn4ri zL#>s;n4bnd&-q-|=>UATp*JnJ-{H10c}EcD+|0FnrQ-FHY3w8O@;&y<48q>?%>UmD zd#Qo72*O{mrhcXPcmVz)fWPJXE^Fc8yYd<5zyr$$M4Q4LKMzl>4f5`uxgO##{}Xt5 zCa`h{Kk;_z@C(Nk|MA!GdjqgCvBtG}8tq2f0GcoNE$FxdI_`vSx}cvS(9uxnX&7`o z9DkN$Rrk{H{lPe`nn}iR)xNgnY}VK@i5R44KP}JWdhxx~LkQAxWKtVW@8MpMmV1$rO)@l~+uLn1Yxp*17dh_!J^(@FCc*W?NiyiV= zbbudl6#id?_nZmm-L5#>UVmZfXiIi(*&RFFT~O?YX!9i zr9-qmW%!>dLRWYJn8QzdCW}~_+BPXGmNBvkzTFgA^HbY*|85Z*x%gdhv0Yf8u|pTXJ>HFpz#T(Z~4C)U-nT_o-(*obkVLaoYYof2ppYr?a?-o8b4q{Hn(f{|6De`B`+m?yG zC|^~^j2pM*NJpf(nZr!v)-?E<^nZHPx9Gh3D|;{5Xu|^E)x7IlEj*WVrhBvU zDnw(U`yIDp6Bhl}x}1SNw?H%cAd}q)9ZrUBzfCQ^iJZR!E#CrtP>yLWGyvUqRm=K9 z>-_17_WARPZpjTYPUUs@7-je-A zvW|S|Em_6ZwvQgQ->LV#ef%+GY7c%O-dFDTEs2&MMxKx^QiI%A121{|__aKDwRM*M z59WQDz)ip8oHS=nzcfdvC%q86N=cu7X*IyGbOynoAD-^AkH5hV^Y_5hXVBJHf9qOZ zcS*kX=VS+IsD6K$-$e_x@5vnOu#=FpwMVV!rGNjxQufzy`$Q{?u~~^$dPFOMPuXf@ zix3SIZLA00?_oR<$oaYGIJ)14j34|hNFN{kFVM#a|9?auZ5s#l(Z?KiI}&``iQ8;- zL^f7{$G1R>w9jTPN1Gmfg*Ik=g*M9mBii@?_#~QOuNg4s$blv~Jn5Oh+}J*uX^#6t zVsm6C0XDBCj_5kcN~h;oo4_Xv*!9O~2l6rbkL)GA;f2;;p=qCChmQuvF6($-g+b~9 
z@Vo4Uf;rVgob2lBD?v6?tamf`W7j4EpSpp=0&E2~O$PtutB_qqxSI#=I>571YKp(Q z!L@oP?FCx*F4?`~k;fB|$9p5YCL+ToAiI z=u+h9iTIShZ1Y}xBlH_PNp>l=5#(LoThYqAG*9!()LzcDx1;7N5;^$>>Jur4r3SfJ zc8gZ#hdrGmq3O4~qu)=NN!(Ivq|w!%nw!l@Mpp!K^>X%Ni9+sO&YqlZj?Dx4jthGx zYps22wz=5n7&)>P8Mh5ta>!7l^sRp{vTFK6+sl#nsMj8&=ZG`2pN}=VzS%zixjpF1 z&E4VFFjq=rHogrv`tlxlZ1WB1s{GcL6=umFbTZuNIyn?O!IlWCW1`WO0{v)NKgNpC z`XcL!FRNChbTIpz=Uavb&UvnRAnPz9a6N+G(24E;UBmep`k3m!7OZGpT9K>Y$FHP4 zBW_Z^lQc(ZkuA&@JhQ@0%}RrLaT{sA4}m8qxYCk4%JLNvv)@0`-bbh^y$~^d=g@>)w$`T!I zLk1G9l>hrAzP)wA;eZX7n9)GqY*|_bbF%^)Wu+tDjsX*Y6W^cgFnrRpg7%xM@Ga_l z>%G@nFS!yrk5!PLLB6YOCVP=j+O)w+&f7h+R+)b@9{4BQI1{tQ{%EUzh3Gxv%f~A6yLJtV1g>=8v_AU@bj?yxX!g z*>?7^cJ;J1tY1F;)f%*8Lp+5o`v`K;QRJf0$jbjWb2j7fMaZWi`*uDy9X~9vpK=eF z)cP-G{ofDpl-h>Jt+t`#tl}X~)5knlP|vRz->>`^@;O8?ZyyqurM=yPu-DG~$?hea z8FDmmoy6Sq)&$S#q~GwiJ)D-Ik~mv=w#*iKg#ikl;ayIM_(()|51)Vq#WN! zIr?FF_B`5L+V|6r6Q?Ghi1u68x+y>WI#*cZd)Pl-rY>_hu}MdXQCkKsze{|~GB-Kf z;Ih68&e*v88f)oh9z4+INamz@;3!M7r{cj{{t7)U-s`8QgD%n2h%eF8Waz14>?b>t zeb0c4iY1V4;dW6|F+e*FXr|F;itStcI@&M@+Hf{Ye*HxDzP`)e`}%LuhxXrmJ$;Z| zAzQM45A0-{J}3?$DDw*U#rqZeu$}q%-FnyR6SSSQi;uhw-*&*io$zsLm`w;lh6u$s zgH3N&_k#(ZSwYOV>R{eVZSNTNkH{k?>15KHl*vXECW~9o zx5fV`rEQC`)ybN5bF6eE`kF?OONc+Hl31B)og=fTCTAJvQP}>Ji`a%OMzuehitsyD z66b?&>7hQE^ukZgK^ld)lxMv`C^rw&oi0^NH zf}`|h>I=%h-2~6mUU;ED?|$L1$GG-QY*Wjb?{tqbdmZ{p{`IP-O&!0L$OEs70M!yD~^BnWu`~OO8A8-si|q zb-Bar`hZ=;i=S7#6YtoA{%K&dX`&xTkP{X$-qqw1Ou{d#xF!eVR9uXFH?}UWbp)Rs z@Y=fE+4eZQa%KDJgDuiO&K=A_9^=%QY6e~&Ly3Ht0XXmxccK_0*&h@K*}z);wRNw- zMyqjrPJb`b57{CttrtA*{PVyX^K93Cp9Wr(PqgI_^(h(Op890*Zpo|pS>RE5JT=i6 zKX7t}z9`N^@!W#h)5L{dfCsumjBBR@Ct=7H%Q?Djo$U6`9OgHNxl-*L1DY;b1Dm(C zHfqrwP0@?)aK@-sKyT`YDDPHvT9az)4!BmgQU9ieeQfQ#C16SOV)u8ce@^XsJMfG$ z=1u%x^CF#5W6TDYTj2k)&77Y&(Q2+6Z@H;;8ieH>`sM~+XVA|9