Implement prototype for instant mmap() loading

This change uses a custom malloc() implementation to transactionally
capture to a file dynamic memory created during the loading process.
That includes (1) the malloc() allocation for mem_buffer and (2) all
the C++ STL objects. On my $1000 personal computer, this change lets
me run ./main to generate a single token (-n 1) using the float16 7B
model (~12gb size) in one second. In order to do that, there's a one
time cost where a 13gb file needs to be generated. This change rocks
but it shouldn't be necessary to do something this heroic. We should
instead change the file format, so that tensors don't need reshaping
and realignment in order to be loaded.
This commit is contained in:
Justine Tunney 2023-03-16 22:03:09 -07:00
parent 2788f373be
commit 5b8023d935
No known key found for this signature in database
GPG Key ID: BE714B4575D6E328
2 changed files with 207 additions and 19 deletions

1
.gitignore vendored
View File

@ -18,6 +18,7 @@ models/*
/main /main
/quantize /quantize
/magic.dat
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json

225
main.cpp
View File

@ -3,6 +3,7 @@
#include "utils.h" #include "utils.h"
#include <cassert> #include <cassert>
#include <cerrno>
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
@ -10,12 +11,24 @@
#include <map> #include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include <atomic>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <fcntl.h>
#include <signal.h> #include <signal.h>
#include <unistd.h> #include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#endif #endif
#define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
#define IS2POW(X) (!((X) & ((X)-1)))
#define MAGIC_PATH "magic.dat"
#define MAGIC_ADDR (char *)0x330000000000
#define MAGIC_GRAN 2097152
#define MAGIC_ALGN (sizeof(size_t) * 2)
#define ANSI_COLOR_RED "\x1b[31m" #define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m" #define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m" #define ANSI_COLOR_YELLOW "\x1b[33m"
@ -83,6 +96,173 @@ struct llama_model {
std::map<std::string, struct ggml_tensor *> tensors; std::map<std::string, struct ggml_tensor *> tensors;
}; };
struct magic {
uint32_t magic;
std::atomic<unsigned> lock;
int fd;
size_t commit;
size_t offset;
size_t capacity;
gpt_vocab *vocab;
llama_model *model;
};
static struct magic *mag;
static inline void spin_lock(std::atomic<unsigned> &lock) {
while (!lock.exchange(1, std::memory_order_acquire));
}
static inline void spin_unlock(std::atomic<unsigned> &lock) {
lock.store(0, std::memory_order_release);
}
static void *Mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
void *res;
res = mmap(addr, length, prot, flags, fd, offset);
if (res != MAP_FAILED) return res;
perror("mmap");
exit(77);
}
static void magic_commit(void) {
mag->offset = mag->capacity;
mag->commit = mag->capacity;
mag->magic = 0xFEEDABEE;
msync(mag, mag->commit, MS_ASYNC);
}
static void magic_init(void) {
int fd;
size_t n;
struct stat st;
if (mag) return;
n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
fstat(fd, &st);
if (st.st_size >= n) {
mag = (struct magic *)Mmap(MAGIC_ADDR, n,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED, fd, 0);
if (mag->magic == 0xFEEDABEE) {
mag = (struct magic *)Mmap(MAGIC_ADDR, mag->capacity,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED, fd, 0);
madvise(MAGIC_ADDR, mag->capacity, MADV_WILLNEED);
ftruncate(fd, mag->commit);
mag->offset = mag->commit;
mag->capacity = mag->commit;
mag->fd = -1;
return;
}
}
ftruncate(fd, 0);
} else if ((fd = open(MAGIC_PATH, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
perror(MAGIC_PATH);
exit(77);
}
ftruncate(fd, n);
mag = (struct magic *)Mmap(MAGIC_ADDR, n,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
mag->offset = MAGIC_GRAN;
mag->fd = fd;
}
void *memalign(size_t a, size_t n) {
void *p;
size_t i, j, k, m;
static int count;
magic_init();
if (a < MAGIC_ALGN) a = MAGIC_ALGN;
while (!IS2POW(a)) ++a;
m = n ? n : 1;
spin_lock(mag->lock);
i = mag->offset;
i = i + sizeof(size_t);
i = ROUNDUP(i, a);
j = ROUNDUP(i + m, MAGIC_GRAN);
if (j > mag->capacity) {
if (!mag->magic) {
ftruncate(mag->fd, j);
p = mmap(MAGIC_ADDR + mag->capacity,
j - mag->capacity, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
} else {
p = mmap(MAGIC_ADDR + mag->capacity,
j - mag->capacity, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
}
if (p != MAP_FAILED) {
mag->capacity = j;
} else {
spin_unlock(mag->lock);
return 0;
}
}
mag->offset = i + m;
spin_unlock(mag->lock);
p = MAGIC_ADDR + i;
((size_t *)p)[-1] = n;
return p;
}
int posix_memalign(void **pp, size_t a, size_t n) {
int e;
void *m;
size_t q, r;
q = a / sizeof(void *);
r = a % sizeof(void *);
if (!r && q && IS2POW(q)) {
e = errno;
m = memalign(a, n);
if (m) {
*pp = m;
return 0;
} else {
errno = e;
return ENOMEM;
}
} else {
return EINVAL;
}
}
void *malloc(size_t n) {
return memalign(MAGIC_ALGN, n);
}
size_t malloc_usable_size(const void *p) {
return ((const size_t *)p)[-1];
}
void *calloc(size_t n, size_t z) {
void *p;
if ((p = malloc((n *= z)))) {
memset(p, 0, n);
}
return p;
}
void free(void *p) {
// do nothing
}
void *realloc(void *p, size_t n) {
void *q;
if (!p) {
return malloc(n);
}
if (!n) {
free(p);
return 0;
}
if ((q = malloc(n))) {
memcpy(q, p, ((const size_t *)p)[-1]);
}
return q;
}
// load the model's weights from a file // load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@ -786,6 +966,8 @@ const char * llama_print_system_info(void) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
magic_init();
ggml_time_init(); ggml_time_init();
const int64_t t_main_start_us = ggml_time_us(); const int64_t t_main_start_us = ggml_time_us();
@ -812,19 +994,24 @@ int main(int argc, char ** argv) {
int64_t t_load_us = 0; int64_t t_load_us = 0;
gpt_vocab vocab;
llama_model model;
// load the model // load the model
{ gpt_vocab *vocab;
llama_model *model;
if (!mag->magic) {
vocab = new gpt_vocab;
model = new llama_model;
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_time_us();
if (!llama_model_load(params.model, *model, *vocab, 512)) { // TODO: set context from user input ??
if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1; return 1;
} }
t_load_us = ggml_time_us() - t_start_us; t_load_us = ggml_time_us() - t_start_us;
mag->vocab = vocab;
mag->model = model;
magic_commit();
} else {
vocab = mag->vocab;
model = mag->model;
} }
// print system information // print system information
@ -842,18 +1029,18 @@ int main(int argc, char ** argv) {
std::vector<float> logits; std::vector<float> logits;
// tokenize the prompt // tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true); std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(*vocab, params.prompt, true);
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); params.n_predict = std::min(params.n_predict, model->hparams.n_ctx - (int) embd_inp.size());
// tokenize the reverse prompt // tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(*vocab, params.antiprompt, false);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab->id_to_token.at(embd_inp[i]).c_str());
} }
fprintf(stderr, "\n"); fprintf(stderr, "\n");
if (params.interactive) { if (params.interactive) {
@ -871,7 +1058,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
for (int i = 0; i < (int) antiprompt_inp.size(); i++) { for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab->id_to_token.at(antiprompt_inp[i]).c_str());
} }
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -883,7 +1070,7 @@ int main(int argc, char ** argv) {
// determine the required inference memory per token: // determine the required inference memory per token:
size_t mem_per_token = 0; size_t mem_per_token = 0;
llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); llama_eval(*model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
int last_n_size = params.repeat_last_n; int last_n_size = params.repeat_last_n;
std::vector<gpt_vocab::id> last_n_tokens(last_n_size); std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
@ -918,7 +1105,7 @@ int main(int argc, char ** argv) {
if (embd.size() > 0) { if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_time_us();
if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { if (!llama_eval(*model, params.n_threads, n_past, embd, logits, mem_per_token)) {
fprintf(stderr, "Failed to predict\n"); fprintf(stderr, "Failed to predict\n");
return 1; return 1;
} }
@ -936,14 +1123,14 @@ int main(int argc, char ** argv) {
const float temp = params.temp; const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty; const float repeat_penalty = params.repeat_penalty;
const int n_vocab = model.hparams.n_vocab; const int n_vocab = model->hparams.n_vocab;
gpt_vocab::id id = 0; gpt_vocab::id id = 0;
{ {
const int64_t t_start_sample_us = ggml_time_us(); const int64_t t_start_sample_us = ggml_time_us();
id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); id = llama_sample_top_p_top_k(*vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);
last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id); last_n_tokens.push_back(id);
@ -980,7 +1167,7 @@ int main(int argc, char ** argv) {
// display text // display text
if (!input_noecho) { if (!input_noecho) {
for (auto id : embd) { for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str()); printf("%s", vocab->id_to_token[id].c_str());
} }
fflush(stdout); fflush(stdout);
} }
@ -1018,7 +1205,7 @@ int main(int argc, char ** argv) {
buf[n_read+1] = 0; buf[n_read+1] = 0;
} }
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false); std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(*vocab, buf, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
remaining_tokens -= line_inp.size(); remaining_tokens -= line_inp.size();
@ -1050,7 +1237,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
} }
ggml_free(model.ctx); ggml_free(model->ctx);
return 0; return 0;
} }