From 5b8023d935401072b73b63ea995aaae040d57b87 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Thu, 16 Mar 2023 22:03:09 -0700
Subject: [PATCH] Implement prototype for instant mmap() loading

This change uses a custom malloc() implementation that transactionally
captures to a file the dynamic memory created during the loading
process. That includes (1) the malloc() allocation for mem_buffer and
(2) all the C++ STL objects.

On my $1000 personal computer, this change lets me run ./main to
generate a single token (-n 1) using the float16 7B model (~12gb in
size) in one second. In order to do that, there's a one-time cost where
a 13gb file needs to be generated.

This change rocks, but it shouldn't be necessary to do something this
heroic. We should instead change the file format, so that tensors don't
need reshaping and realignment in order to be loaded.
---
 .gitignore |   1 +
 main.cpp   | 225 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 207 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5eb1ff1b8..e388b884e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ models/*
 
 /main
 /quantize
+/magic.dat
 
 arm_neon.h
 compile_commands.json
diff --git a/main.cpp b/main.cpp
index 6dc9ae980..aee1fbaf8 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,6 +3,7 @@
 #include "utils.h"
 
 #include <cassert>
+#include <cerrno>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -10,12 +11,24 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <atomic>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <fcntl.h>
 #include <signal.h>
 #include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 #endif
 
+#define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
+#define IS2POW(X) (!((X) & ((X)-1)))
+
+#define MAGIC_PATH "magic.dat"
+#define MAGIC_ADDR (char *)0x330000000000
+#define MAGIC_GRAN 2097152
+#define MAGIC_ALGN (sizeof(size_t) * 2)
+
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
 #define ANSI_COLOR_YELLOW  "\x1b[33m"
@@ -83,6 +96,173 @@ struct llama_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };
 
+// transactional heap capture: every allocation made while the model is
+// loading lands in magic.dat, which later runs map back at the same
+// fixed address, so none of the loading work has to be repeated
+struct magic {
+    uint32_t magic;              // 0xFEEDABEE once the capture is committed
+    std::atomic<unsigned> lock;
+    int fd;
+    size_t commit;               // committed size of magic.dat in bytes
+    size_t offset;               // bump-allocator cursor
+    size_t capacity;             // bytes currently mapped at MAGIC_ADDR
+    gpt_vocab *vocab;            // roots of the captured object graph
+    llama_model *model;
+};
+
+static struct magic *mag;
+
+static inline void spin_lock(std::atomic<unsigned> &lock) {
+    while (lock.exchange(1, std::memory_order_acquire));  // spin while held
+}
+
+static inline void spin_unlock(std::atomic<unsigned> &lock) {
+    lock.store(0, std::memory_order_release);
+}
+
+static void *Mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
+    void *res;
+    res = mmap(addr, length, prot, flags, fd, offset);
+    if (res != MAP_FAILED) return res;
+    perror("mmap");
+    exit(77);
+}
+
+static void magic_commit(void) {
+    mag->offset = mag->capacity;
+    mag->commit = mag->capacity;
+    mag->magic = 0xFEEDABEE;
+    msync(mag, mag->commit, MS_ASYNC);
+}
+
+static void magic_init(void) {
+    int fd;
+    size_t n;
+    struct stat st;
+    if (mag) return;
+    n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
+    if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
+        fstat(fd, &st);
+        if (st.st_size >= n) {
+            mag = (struct magic *)Mmap(MAGIC_ADDR, n,
+                                       PROT_READ | PROT_WRITE,
+                                       MAP_PRIVATE | MAP_FIXED, fd, 0);
+            if (mag->magic == 0xFEEDABEE) {
+                // committed capture: map the whole thing copy-on-write
+                mag = (struct magic *)Mmap(MAGIC_ADDR, mag->capacity,
+                                           PROT_READ | PROT_WRITE,
+                                           MAP_PRIVATE | MAP_FIXED, fd, 0);
+                madvise(MAGIC_ADDR, mag->capacity, MADV_WILLNEED);
+                ftruncate(fd, mag->commit);
+                mag->offset = mag->commit;
+                mag->capacity = mag->commit;
+                mag->fd = -1;
+                return;
+            }
+        }
+        ftruncate(fd, 0);
+    } else if ((fd = open(MAGIC_PATH, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
+        perror(MAGIC_PATH);
+        exit(77);
+    }
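+    // fresh (or invalidated) capture: extend the file to hold the
+    // rounded-up header, then map it MAP_SHARED below so every store
+    // made during loading gets recorded into magic.dat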
+    ftruncate(fd, n);
+    mag = (struct magic *)Mmap(MAGIC_ADDR, n,
+                               PROT_READ | PROT_WRITE,
+                               MAP_SHARED | MAP_FIXED, fd, 0);
+    mag->offset = MAGIC_GRAN;
+    mag->fd = fd;
+}
+
+void *memalign(size_t a, size_t n) {
+    void *p;
+    size_t i, j, m;
+    magic_init();
+    if (a < MAGIC_ALGN) a = MAGIC_ALGN;
+    while (!IS2POW(a)) ++a;        // round alignment up to a power of two
+    m = n ? n : 1;
+    spin_lock(mag->lock);
+    i = mag->offset;
+    i = i + sizeof(size_t);        // reserve room for the size header
+    i = ROUNDUP(i, a);
+    j = ROUNDUP(i + m, MAGIC_GRAN);
+    if (j > mag->capacity) {
+        if (!mag->magic) {
+            // still recording: grow the file and map the new pages shared
+            ftruncate(mag->fd, j);
+            p = mmap(MAGIC_ADDR + mag->capacity,
+                     j - mag->capacity, PROT_READ | PROT_WRITE,
+                     MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
+        } else {
+            // already committed: satisfy growth with anonymous memory
+            p = mmap(MAGIC_ADDR + mag->capacity,
+                     j - mag->capacity, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+        }
+        if (p != MAP_FAILED) {
+            mag->capacity = j;
+        } else {
+            spin_unlock(mag->lock);
+            return 0;
+        }
+    }
+    mag->offset = i + m;
+    spin_unlock(mag->lock);
+    p = MAGIC_ADDR + i;
+    ((size_t *)p)[-1] = n;         // stash the size for malloc_usable_size()
+    return p;
+}
+
+int posix_memalign(void **pp, size_t a, size_t n) {
+    int e;
+    void *m;
+    size_t q, r;
+    q = a / sizeof(void *);
+    r = a % sizeof(void *);
+    if (!r && q && IS2POW(q)) {
+        e = errno;
+        m = memalign(a, n);
+        if (m) {
+            *pp = m;
+            return 0;
+        } else {
+            errno = e;             // posix_memalign() must not touch errno
+            return ENOMEM;
+        }
+    } else {
+        return EINVAL;
+    }
+}
+
+void *malloc(size_t n) {
+    return memalign(MAGIC_ALGN, n);
+}
+
+size_t malloc_usable_size(const void *p) {
+    return ((const size_t *)p)[-1];
+}
+
+void *calloc(size_t n, size_t z) {
+    void *p;
+    if ((p = malloc((n *= z)))) {  // note: no overflow check in this prototype
+        memset(p, 0, n);
+    }
+    return p;
+}
+
+void free(void *p) {
+    // do nothing: the arena is never reclaimed
+}
+
+void *realloc(void *p, size_t n) {
+    void *q;
+    size_t m;
+    if (!p) {
+        return malloc(n);
+    }
+    if (!n) {
+        free(p);
+        return 0;
+    }
+    if ((q = malloc(n))) {
+        m = ((const size_t *)p)[-1];
+        memcpy(q, p, m < n ? m : n);  // copy no more than the new block holds
+    }
+    return q;
+}
+
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@@ -786,6 +966,8 @@ const char * llama_print_system_info(void) {
 }
 
 int main(int argc, char ** argv) {
+    magic_init();
+
     ggml_time_init();
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -812,19 +994,24 @@ int main(int argc, char ** argv) {
     int64_t t_load_us = 0;
 
-    gpt_vocab vocab;
-    llama_model model;
-
-    // load the model
-    {
+    gpt_vocab *vocab;
+    llama_model *model;
+    if (!mag->magic) {
+        vocab = new gpt_vocab;
+        model = new llama_model;
         const int64_t t_start_us = ggml_time_us();
-
-        if (!llama_model_load(params.model, model, vocab, 512)) {  // TODO: set context from user input ??
+        if (!llama_model_load(params.model, *model, *vocab, 512)) {  // TODO: set context from user input ??
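+            // nothing has been committed at this point, so magic.dat
+            // still has magic == 0 and will be rebuilt on the next run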
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); return 1; } - t_load_us = ggml_time_us() - t_start_us; + mag->vocab = vocab; + mag->model = model; + magic_commit(); + } else { + vocab = mag->vocab; + model = mag->model; } // print system information @@ -842,18 +1029,18 @@ int main(int argc, char ** argv) { std::vector logits; // tokenize the prompt - std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); + std::vector embd_inp = ::llama_tokenize(*vocab, params.prompt, true); - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + params.n_predict = std::min(params.n_predict, model->hparams.n_ctx - (int) embd_inp.size()); // tokenize the reverse prompt - std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); + std::vector antiprompt_inp = ::llama_tokenize(*vocab, params.antiprompt, false); fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab->id_to_token.at(embd_inp[i]).c_str()); } fprintf(stderr, "\n"); if (params.interactive) { @@ -871,7 +1058,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); for (int i = 0; i < (int) antiprompt_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab->id_to_token.at(antiprompt_inp[i]).c_str()); } fprintf(stderr, "\n"); } @@ -883,7 +1070,7 @@ int main(int argc, char ** argv) { // determine the required inference memory per token: size_t mem_per_token = 0; - llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + llama_eval(*model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); int last_n_size = params.repeat_last_n; std::vector last_n_tokens(last_n_size); @@ -918,7 +1105,7 @@ int main(int argc, char ** argv) { if (embd.size() > 0) { const int64_t t_start_us = ggml_time_us(); - if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + if (!llama_eval(*model, params.n_threads, n_past, embd, logits, mem_per_token)) { fprintf(stderr, "Failed to predict\n"); return 1; } @@ -936,14 +1123,14 @@ int main(int argc, char ** argv) { const float temp = params.temp; const float repeat_penalty = params.repeat_penalty; - const int n_vocab = model.hparams.n_vocab; + const int n_vocab = model->hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); + id = llama_sample_top_p_top_k(*vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); @@ -980,7 +1167,7 @@ int main(int argc, char ** argv) { // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); + printf("%s", vocab->id_to_token[id].c_str()); } fflush(stdout); } @@ -1018,7 +1205,7 @@ int 
                     buf[n_read+1] = 0;
                 }
 
-                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(*vocab, buf, false);
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
                 remaining_tokens -= line_inp.size();
@@ -1050,7 +1237,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
-    ggml_free(model.ctx);
+    ggml_free(model->ctx);
 
     return 0;
 }
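
P.S. For readers following the trick: the essence is a bump allocator
whose backing pages come from a MAP_SHARED file mapping at a fixed
address, plus a commit flag that is written last. Below is a minimal
standalone sketch of that idea. It is not part of the patch; the file
name, struct layout, and sizes are illustrative assumptions, and error
handling is reduced to early returns.

    // minimal sketch of transactional heap capture (illustrative only)
    // build: cc -o snap snap.c && ./snap && ./snap
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define SNAP_PATH "snap.dat"                // hypothetical file name
    #define SNAP_ADDR ((char *)0x330000000000)  // fixed address, as in the patch
    #define SNAP_SIZE 2097152

    struct snap {
        uint32_t magic;   // becomes 0xFEEDABEE only after data is complete
        char data[64];    // stand-in for expensively computed state
    };

    int main(void) {
        int fd = open(SNAP_PATH, O_RDWR | O_CREAT, 0644);
        if (fd == -1 || ftruncate(fd, SNAP_SIZE) == -1) return 1;
        // same fixed address every run, so pointers stored inside stay valid
        struct snap *s = (struct snap *)mmap(SNAP_ADDR, SNAP_SIZE,
                                             PROT_READ | PROT_WRITE,
                                             MAP_SHARED | MAP_FIXED, fd, 0);
        if (s == MAP_FAILED) return 1;
        if (s->magic != 0xFEEDABEE) {
            strcpy(s->data, "expensive result");  // slow path: build the state
            s->magic = 0xFEEDABEE;                // commit marker written last
            msync(s, sizeof(*s), MS_ASYNC);
        }
        printf("%s\n", s->data);                  // fast path on later runs
        return 0;
    }

Writing the magic value after everything else is what makes the capture
transactional: a crash mid-build leaves magic == 0, so the next run
simply rebuilds the state instead of trusting a half-written file.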