mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 05:42:22 +01:00
5b8023d935
This change uses a custom malloc() implementation to transactionally capture, to a file, the dynamic memory created during the loading process. That includes (1) the malloc() allocation for mem_buffer and (2) all the C++ STL objects. On my $1000 personal computer, this change lets me run ./main to generate a single token (-n 1) using the float16 7B model (~12gb size) in one second. In order to do that, there's a one-time cost where a 13gb file needs to be generated. This change rocks, but it shouldn't be necessary to do something this heroic. We should instead change the file format so that tensors don't need reshaping and realignment in order to be loaded.
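In miniature, the idea is to interpose malloc() over a file-backed mapping at a fixed address, so a later process can map the same file at the same address and keep using the old pointers. A rough sketch of that idea follows (illustrative names only, not the code in this file; error handling omitted):

    // sketch: every allocation is carved out of a file mapped at a fixed address
    static char  *arena;        // e.g. mmap(FIXED_ADDR, ..., MAP_SHARED, arena_fd, 0)
    static size_t arena_used;   // bump offset, persisted in the file header

    void *malloc(size_t n) {    // interposed over the libc allocator
        void *p = arena + arena_used;
        arena_used += (n + 15) & ~(size_t)15;   // keep 16-byte alignment
        return p;               // same address in every run that maps the file
    }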
1244 lines
42 KiB
C++
#include "ggml.h"
|
|
|
|
#include "utils.h"
|
|
|
|
#include <cassert>
|
|
#include <cerrno>
|
|
#include <cmath>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <fstream>
|
|
#include <map>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <atomic>
|
|
|
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
#include <fcntl.h>
|
|
#include <signal.h>
|
|
#include <unistd.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/stat.h>
|
|
#endif
|
|
|
|
#define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
|
|
#define IS2POW(X) (!((X) & ((X)-1)))
|
|
|
|
#define MAGIC_PATH "magic.dat"
|
|
#define MAGIC_ADDR (char *)0x330000000000
|
|
#define MAGIC_GRAN 2097152
|
|
#define MAGIC_ALGN (sizeof(size_t) * 2)
|
|
|
|
#define ANSI_COLOR_RED "\x1b[31m"
|
|
#define ANSI_COLOR_GREEN "\x1b[32m"
|
|
#define ANSI_COLOR_YELLOW "\x1b[33m"
|
|
#define ANSI_COLOR_BLUE "\x1b[34m"
|
|
#define ANSI_COLOR_MAGENTA "\x1b[35m"
|
|
#define ANSI_COLOR_CYAN "\x1b[36m"
|
|
#define ANSI_COLOR_RESET "\x1b[0m"
|
|
#define ANSI_BOLD "\x1b[1m"
|
|
|
|
// determine number of model parts based on the dimension
|
|
static const std::map<int, int> LLAMA_N_PARTS = {
|
|
{ 4096, 1 },
|
|
{ 5120, 2 },
|
|
{ 6656, 4 },
|
|
{ 8192, 8 },
|
|
};
|
|
|
|
// default hparams (LLaMA 7B)
|
|
struct llama_hparams {
|
|
int32_t n_vocab = 32000;
|
|
int32_t n_ctx = 512; // this is provided as user input?
|
|
int32_t n_embd = 4096;
|
|
int32_t n_mult = 256;
|
|
int32_t n_head = 32;
|
|
int32_t n_layer = 32;
|
|
int32_t n_rot = 64;
|
|
int32_t f16 = 1;
|
|
};
|
|
|
|
struct llama_layer {
|
|
// normalization
|
|
struct ggml_tensor * attention_norm;
|
|
|
|
// attention
|
|
struct ggml_tensor * wq;
|
|
struct ggml_tensor * wk;
|
|
struct ggml_tensor * wv;
|
|
struct ggml_tensor * wo;
|
|
|
|
// normalization
|
|
struct ggml_tensor * ffn_norm;
|
|
|
|
// ff
|
|
struct ggml_tensor * w1;
|
|
struct ggml_tensor * w2;
|
|
struct ggml_tensor * w3;
|
|
};
|
|
|
|
struct llama_model {
|
|
llama_hparams hparams;
|
|
|
|
struct ggml_tensor * tok_embeddings;
|
|
|
|
struct ggml_tensor * norm;
|
|
struct ggml_tensor * output;
|
|
|
|
std::vector<llama_layer> layers;
|
|
|
|
// key + value memory
|
|
struct ggml_tensor * memory_k;
|
|
struct ggml_tensor * memory_v;
|
|
|
|
//
|
|
struct ggml_context * ctx;
|
|
std::map<std::string, struct ggml_tensor *> tensors;
|
|
};
|
|
|
|
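// header of the arena persisted at MAGIC_ADDR: `magic` is set to 0xFEEDABEE once
// a capture has been committed, `lock` guards the bump allocator, `fd`, `commit`,
// `offset` and `capacity` track the backing file, and `vocab`/`model` point at the
// captured C++ objects so a later run can reuse them directly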
struct magic {
    uint32_t magic;
    std::atomic<unsigned> lock;
    int fd;
    size_t commit;
    size_t offset;
    size_t capacity;
    gpt_vocab *vocab;
    llama_model *model;
};

static struct magic *mag;

static inline void spin_lock(std::atomic<unsigned> &lock) {
    // spin until the previous value was 0, i.e. until we take the lock
    while (lock.exchange(1, std::memory_order_acquire));
}

static inline void spin_unlock(std::atomic<unsigned> &lock) {
    lock.store(0, std::memory_order_release);
}

static void *Mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
    void *res;
    res = mmap(addr, length, prot, flags, fd, offset);
    if (res != MAP_FAILED) return res;
    perror("mmap");
    exit(77);
}

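// seal the capture: record the final arena extent and stamp the magic value so
// subsequent runs will map and reuse MAGIC_PATH instead of reloading the model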
static void magic_commit(void) {
    mag->offset = mag->capacity;
    mag->commit = mag->capacity;
    mag->magic = 0xFEEDABEE;
    msync(mag, mag->commit, MS_ASYNC);
}

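// map MAGIC_PATH at the fixed address MAGIC_ADDR; if the file already holds a
// committed capture, remap it copy-on-write and reuse it, otherwise (re)create
// the file and map its header shared so new allocations are written to disk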
static void magic_init(void) {
    int fd;
    size_t n;
    struct stat st;
    if (mag) return;
    n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
    if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
        fstat(fd, &st);
        if (st.st_size >= n) {
            mag = (struct magic *)Mmap(MAGIC_ADDR, n,
                                       PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_FIXED, fd, 0);
            if (mag->magic == 0xFEEDABEE) {
                mag = (struct magic *)Mmap(MAGIC_ADDR, mag->capacity,
                                           PROT_READ | PROT_WRITE,
                                           MAP_PRIVATE | MAP_FIXED, fd, 0);
                madvise(MAGIC_ADDR, mag->capacity, MADV_WILLNEED);
                ftruncate(fd, mag->commit);
                mag->offset = mag->commit;
                mag->capacity = mag->commit;
                mag->fd = -1;
                return;
            }
        }
        ftruncate(fd, 0);
    } else if ((fd = open(MAGIC_PATH, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
        perror(MAGIC_PATH);
        exit(77);
    }
    ftruncate(fd, n);
    mag = (struct magic *)Mmap(MAGIC_ADDR, n,
                               PROT_READ | PROT_WRITE,
                               MAP_SHARED | MAP_FIXED, fd, 0);
    mag->offset = MAGIC_GRAN;
    mag->fd = fd;
}

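// bump allocator over the mapped arena: reserves an aligned block past the
// current offset, growing the mapping as needed (file-backed while the capture
// is still being recorded, anonymous memory afterwards), and stashes the request
// size just before the returned pointer for malloc_usable_size()/realloc()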
void *memalign(size_t a, size_t n) {
    void *p;
    size_t i, j, k, m;
    static int count;
    magic_init();
    if (a < MAGIC_ALGN) a = MAGIC_ALGN;
    while (!IS2POW(a)) ++a;
    m = n ? n : 1;
    spin_lock(mag->lock);
    i = mag->offset;
    i = i + sizeof(size_t);
    i = ROUNDUP(i, a);
    j = ROUNDUP(i + m, MAGIC_GRAN);
    if (j > mag->capacity) {
        if (!mag->magic) {
            ftruncate(mag->fd, j);
            p = mmap(MAGIC_ADDR + mag->capacity,
                     j - mag->capacity, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
        } else {
            p = mmap(MAGIC_ADDR + mag->capacity,
                     j - mag->capacity, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        }
        if (p != MAP_FAILED) {
            mag->capacity = j;
        } else {
            spin_unlock(mag->lock);
            return 0;
        }
    }
    mag->offset = i + m;
    spin_unlock(mag->lock);
    p = MAGIC_ADDR + i;
    ((size_t *)p)[-1] = n;
    return p;
}

int posix_memalign(void **pp, size_t a, size_t n) {
    int e;
    void *m;
    size_t q, r;
    q = a / sizeof(void *);
    r = a % sizeof(void *);
    if (!r && q && IS2POW(q)) {
        e = errno;
        m = memalign(a, n);
        if (m) {
            *pp = m;
            return 0;
        } else {
            errno = e;
            return ENOMEM;
        }
    } else {
        return EINVAL;
    }
}

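// libc-style entry points; they all route into the arena above, and free() is
// a no-op because the arena only ever grows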
void *malloc(size_t n) {
    return memalign(MAGIC_ALGN, n);
}

size_t malloc_usable_size(const void *p) {
    return ((const size_t *)p)[-1];
}

void *calloc(size_t n, size_t z) {
    void *p;
    if ((p = malloc((n *= z)))) {
        memset(p, 0, n);
    }
    return p;
}

void free(void *p) {
    // do nothing
}

void *realloc(void *p, size_t n) {
    void *q;
    if (!p) {
        return malloc(n);
    }
    if (!n) {
        free(p);
        return 0;
    }
    if ((q = malloc(n))) {
        // copy no more than the smaller of the old and new sizes
        size_t m = ((const size_t *)p)[-1];
        memcpy(q, p, m < n ? m : n);
    }
    return q;
}

// load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

    std::vector<char> f_buf(1024*1024);

    auto fin = std::ifstream(fname, std::ios::binary);
    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic != 0x67676d6c) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
            return false;
        }
    }

    int n_ff = 0;
    int n_parts = 0;

    // load hparams
    {
        auto & hparams = model.hparams;

        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
        fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fin.read((char *) &hparams.f16, sizeof(hparams.f16));

        hparams.n_ctx = n_ctx;

        n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
        n_parts = LLAMA_N_PARTS.at(hparams.n_embd);

        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
        fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
    }

    // load vocab
    {
        const int32_t n_vocab = model.hparams.n_vocab;

        if (n_vocab != model.hparams.n_vocab) {
            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
            return false;
        }

        std::string word;
        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
            fin.read((char *) &len, sizeof(len));

            word.resize(len);
            fin.read((char *) word.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;

            //if (i < 30000) {
            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
            //}
        }
    }

    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    ggml_type wtype = GGML_TYPE_COUNT;
    switch (model.hparams.f16) {
        case 0: wtype = GGML_TYPE_F32;  break;
        case 1: wtype = GGML_TYPE_F16;  break;
        case 2: wtype = GGML_TYPE_Q4_0; break;
        case 3: wtype = GGML_TYPE_Q4_1; break;
        default:
        {
            fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
                    __func__, fname.c_str(), model.hparams.f16);
            return false;
        }
    }

    const ggml_type wtype2 = GGML_TYPE_F32;

    auto & ctx = model.ctx;

    size_t ctx_size = 0;

    {
        const auto & hparams = model.hparams;

        const int n_embd = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings

        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm

        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output

        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm

        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo

        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm

        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3

        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (5 + 10*n_layer)*256; // object overhead

        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }

    // create the ggml context
    {
        struct ggml_init_params params = {
            /*.mem_size =*/ ctx_size,
            /*.mem_buffer =*/ NULL,
        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            return false;
        }
    }

    // prepare memory for the weights
    {
        const auto & hparams = model.hparams;

        const int n_embd = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

        model.layers.resize(n_layer);

        model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);

        model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.output = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);

        // map by name
        model.tensors["tok_embeddings.weight"] = model.tok_embeddings;

        model.tensors["norm.weight"] = model.norm;
        model.tensors["output.weight"] = model.output;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);

            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
            layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
            layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);

            // map by name
            model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;

            model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
            model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
            model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
            model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;

            model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;

            model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
        }
    }

    // key + value memory
    {
        const auto & hparams = model.hparams;

        const int n_embd = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx = hparams.n_ctx;

        const int n_mem = n_layer*n_ctx;
        const int n_elements = n_embd*n_mem;

        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);

        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

        fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
    }

    const size_t file_offset = fin.tellg();

    fin.close();

    std::vector<uint8_t> tmp;

    for (int i = 0; i < n_parts; ++i) {
        const int part_id = i;
        //const int part_id = n_parts - i - 1;

        std::string fname_part = fname;
        if (i > 0) {
            fname_part += "." + std::to_string(i);
        }

        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());

        fin = std::ifstream(fname_part, std::ios::binary);
        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
        fin.seekg(file_offset);

        // load weights
        {
            int n_tensors = 0;
            size_t total_size = 0;

            fprintf(stderr, "%s: ", __func__);

            while (true) {
                int32_t n_dims;
                int32_t length;
                int32_t ftype;

                fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
                fin.read(reinterpret_cast<char *>(&length), sizeof(length));
                fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));

                if (fin.eof()) {
                    break;
                }

                int32_t nelements = 1;
                int32_t ne[2] = { 1, 1 };
                for (int i = 0; i < n_dims; ++i) {
                    fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                    nelements *= ne[i];
                }

                std::string name(length, 0);
                fin.read(&name[0], length);

                if (model.tensors.find(name.data()) == model.tensors.end()) {
                    fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                    return false;
                }

                // split_type = 0: split by columns
                // split_type = 1: split by rows
                int split_type = 0;

                // split_type = 0:
                // regex:
                //   - tok_embeddings.*
                //   - layers.*.attention.wo.weight
                //   - layers.*.feed_forward.w2.weight

                // split_type = 1:
                // regex:
                //   - output.*
                //   - layers.*.attention.wq.weight
                //   - layers.*.attention.wk.weight
                //   - layers.*.attention.wv.weight
                //   - layers.*.feed_forward.w1.weight
                //   - layers.*.feed_forward.w3.weight
                if (name.find("tok_embeddings") != std::string::npos) {
                    split_type = 0;
                } else if (name.find("layers") != std::string::npos) {
                    if (name.find("attention.wo.weight") != std::string::npos) {
                        split_type = 0;
                    } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
                        split_type = 0;
                    } else {
                        split_type = 1;
                    }
                } else if (name.find("output") != std::string::npos) {
                    split_type = 1;
                }

                auto tensor = model.tensors[name.data()];

                if (n_dims == 1) {
                    if (ggml_nelements(tensor) != nelements) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                        return false;
                    }
                } else {
                    if (ggml_nelements(tensor)/n_parts != nelements) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                        return false;
                    }
                }

                if (n_dims == 1) {
                    if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                                __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                        return false;
                    }
                } else {
                    if (split_type == 0) {
                        if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                                    __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
                            return false;
                        }
                    } else {
                        if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                                    __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
                            return false;
                        }
                    }
                }

                if (0) {
                    static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
                }

                size_t bpe = 0;

                switch (ftype) {
                    case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
                    case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
                    case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
                    case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
                    default:
                    {
                        fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
                        return false;
                    }
                };

                if (n_dims == 1 || n_parts == 1) {
                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                                __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                        return false;
                    }

                    if (part_id == 0) {
                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
                    } else {
                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
                    }

                    total_size += ggml_nbytes(tensor);
                } else {
                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                                __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
                        return false;
                    }

                    if (split_type == 0) {
                        const int np0 = ne[0];

                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
                        assert(row_size == tensor->nb[1]);

                        for (int i1 = 0; i1 < ne[1]; ++i1) {
                            const size_t offset_row = i1*row_size;
                            const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
                            fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
                        }
                    } else {
                        const int np1 = ne[1];

                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);

                        for (int i1 = 0; i1 < ne[1]; ++i1) {
                            const size_t offset_row = (i1 + part_id*np1)*row_size;
                            fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
                        }
                    }

                    total_size += ggml_nbytes(tensor)/n_parts;
                }

                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
                if (++n_tensors % 8 == 0) {
                    fprintf(stderr, ".");
                    fflush(stderr);
                }
            }

            fprintf(stderr, " done\n");

            fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
        }

        fin.close();
    }

    return true;
}

// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
// The GPT-J model requires about 16MB of memory per input token.
//
bool llama_eval(
        const llama_model & model,
        const int n_threads,
        const int n_past,
        const std::vector<gpt_vocab::id> & embd_inp,
        std::vector<float> & embd_w,
        size_t & mem_per_token) {
    const int N = embd_inp.size();

    const auto & hparams = model.hparams;

    const int n_embd = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_ctx = hparams.n_ctx;
    const int n_head = hparams.n_head;
    const int n_vocab = hparams.n_vocab;
    const int n_rot = hparams.n_embd/hparams.n_head;

    const int d_key = n_embd/n_head;

    static size_t buf_size = 512u*1024*1024;
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        buf_size = buf_size_new;
        buf = realloc(buf, buf_size);
        if (buf == nullptr) {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
            return false;
        }
    }

    struct ggml_init_params params = {
        /*.mem_size =*/ buf_size,
        /*.mem_buffer =*/ buf,
    };

    struct ggml_context * ctx0 = ggml_init(params);
    ggml_cgraph gf = {};
    gf.n_threads = n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));

    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // norm
        {
            cur = ggml_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
                    cur);
        }

        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);

            // store key and value to memory
            if (N >= 1) {
                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));

                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        ggml_rope(ctx0,
                            ggml_cpy(ctx0,
                                Qcur,
                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
                            n_past, n_rot, 0),
                        0, 2, 1, 3);

            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_rope(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
                            n_past, n_rot, 1),
                        0, 2, 1, 3);

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
                        );

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor * V_trans =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        1, 2, 0, 3);

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].wo,
                    cur);
        }

        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                cur = ggml_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
                        cur);
            }

            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model.layers[il].w3,
                    cur);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w1,
                    cur);

            // SILU activation
            cur = ggml_silu(ctx0, cur);

            cur = ggml_mul(ctx0, cur, tmp);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w2,
                    cur);
        }

        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        inpL = cur;
    }

    // norm
    {
        inpL = ggml_norm(ctx0, inpL);

        // inpL = norm*inpL
        inpL = ggml_mul(ctx0,
                ggml_repeat(ctx0, model.norm, inpL),
                inpL);
    }

    // lm_head
    {
        inpL = ggml_mul_mat(ctx0, model.output, inpL);
    }

    // logits -> probs
    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute (ctx0, &gf);

    //if (n_past%100 == 0) {
    //    ggml_graph_print (&gf);
    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
    //}

    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

    // return result for just the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }
    //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));

    ggml_free(ctx0);

    return true;
}

static bool is_interacting = false;

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting = true;
        } else {
            _exit(130);
        }
    }
}
#endif

const char * llama_print_system_info(void) {
    static std::string s;

    s = "";
    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

    return s.c_str();
}

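// on the first run the model is loaded normally and the resulting heap objects
// are committed to MAGIC_PATH; on later runs the committed capture is mapped
// back in and vocab/model are reused without reloading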
int main(int argc, char ** argv) {
    magic_init();

    ggml_time_init();
    const int64_t t_main_start_us = ggml_time_us();

    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (params.seed < 0) {
        params.seed = time(NULL);
    }

    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

    std::mt19937 rng(params.seed);
    if (params.prompt.empty()) {
        params.prompt = gpt_random_prompt(rng);
    }

    // params.prompt = R"(// this function checks if the number n is prime
    //bool is_prime(int n) {)";

    int64_t t_load_us = 0;

    // load the model
    gpt_vocab *vocab;
    llama_model *model;
    if (!mag->magic) {
        vocab = new gpt_vocab;
        model = new llama_model;
        const int64_t t_start_us = ggml_time_us();
        if (!llama_model_load(params.model, *model, *vocab, 512)) {  // TODO: set context from user input ??
            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
            return 1;
        }
        t_load_us = ggml_time_us() - t_start_us;
        mag->vocab = vocab;
        mag->model = model;
        magic_commit();
    } else {
        vocab = mag->vocab;
        model = mag->model;
    }

    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }

    int n_past = 0;

    int64_t t_sample_us = 0;
    int64_t t_predict_us = 0;

    std::vector<float> logits;

    // tokenize the prompt
    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(*vocab, params.prompt, true);

    params.n_predict = std::min(params.n_predict, model->hparams.n_ctx - (int) embd_inp.size());

    // tokenize the reverse prompt
    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(*vocab, params.antiprompt, false);

    fprintf(stderr, "\n");
    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
    for (int i = 0; i < (int) embd_inp.size(); i++) {
        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab->id_to_token.at(embd_inp[i]).c_str());
    }
    fprintf(stderr, "\n");
    if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
#endif

        fprintf(stderr, "%s: interactive mode on.\n", __func__);

        if (antiprompt_inp.size()) {
            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
            for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab->id_to_token.at(antiprompt_inp[i]).c_str());
            }
            fprintf(stderr, "\n");
        }
    }
    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
    fprintf(stderr, "\n\n");

    std::vector<gpt_vocab::id> embd;

    // determine the required inference memory per token:
    size_t mem_per_token = 0;
    llama_eval(*model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);

    int last_n_size = params.repeat_last_n;
    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);


    if (params.interactive) {
        fprintf(stderr, "== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
                        " - Press Ctrl+C to interject at any time.\n"
#endif
                        " - Press Return to return control to LLaMa.\n"
                        " - If you want to submit another line, end your input in '\\'.\n");
    }

    int remaining_tokens = params.n_predict;
    int input_consumed = 0;
    bool input_noecho = false;

    // prompt user immediately after the starting prompt has been loaded
    if (params.interactive_start) {
        is_interacting = true;
    }

    // set the color for the prompt which will be output initially
    if (params.use_color) {
        printf(ANSI_COLOR_YELLOW);
    }

    while (remaining_tokens > 0) {
        // predict
        if (embd.size() > 0) {
            const int64_t t_start_us = ggml_time_us();

            if (!llama_eval(*model, params.n_threads, n_past, embd, logits, mem_per_token)) {
                fprintf(stderr, "Failed to predict\n");
                return 1;
            }

            t_predict_us += ggml_time_us() - t_start_us;
        }

        n_past += embd.size();
        embd.clear();

        if (embd_inp.size() <= input_consumed) {
            // out of user input, sample next token
            const float top_k = params.top_k;
            const float top_p = params.top_p;
            const float temp = params.temp;
            const float repeat_penalty = params.repeat_penalty;

            const int n_vocab = model->hparams.n_vocab;

            gpt_vocab::id id = 0;

            {
                const int64_t t_start_sample_us = ggml_time_us();

                id = llama_sample_top_p_top_k(*vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);

                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(id);

                t_sample_us += ggml_time_us() - t_start_sample_us;
            }

            // add it to the context
            embd.push_back(id);

            // echo this to console
            input_noecho = false;

            // decrement remaining sampling budget
            --remaining_tokens;
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            while (embd_inp.size() > input_consumed) {
                embd.push_back(embd_inp[input_consumed]);
                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(embd_inp[input_consumed]);
                ++input_consumed;
                if (embd.size() > params.n_batch) {
                    break;
                }
            }

            // reset color to default if there is no pending user input
            if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) {
                printf(ANSI_COLOR_RESET);
            }
        }

        // display text
        if (!input_noecho) {
            for (auto id : embd) {
                printf("%s", vocab->id_to_token[id].c_str());
            }
            fflush(stdout);
        }

        // in interactive mode, and not currently processing queued inputs;
        // check if we should prompt the user for more
        if (params.interactive && embd_inp.size() <= input_consumed) {
            // check for reverse prompt
            if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
                // reverse prompt found
                is_interacting = true;
            }
            if (is_interacting) {
                // currently being interactive
                bool another_line = true;
                while (another_line) {
                    fflush(stdout);
                    char buf[256] = {0};
                    int n_read;
                    if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
                        // presumably an empty line, consume the newline
                        scanf("%*c");
                        n_read = 0;
                    }
                    if (params.use_color) printf(ANSI_COLOR_RESET);

                    if (n_read > 0 && buf[n_read-1] == '\\') {
                        another_line = true;
                        buf[n_read-1] = '\n';
                        buf[n_read] = 0;
                    } else {
                        another_line = false;
                        buf[n_read] = '\n';
                        buf[n_read+1] = 0;
                    }

                    std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(*vocab, buf, false);
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

                    remaining_tokens -= line_inp.size();

                    input_noecho = true; // do not echo this again
                }

                is_interacting = false;
            }
        }

        // end of text token
        if (embd.back() == 2) {
            fprintf(stderr, " [end of text]\n");
            break;
        }
    }


    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        fprintf(stderr, "\n\n");
        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
        fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
        fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
        fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
        fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }

    ggml_free(model->ctx);

    return 0;
}