llama : avoid double tensor copy when saving session to buffer

This commit is contained in:
Francis Couture-Harpin 2024-08-07 16:03:17 -04:00
parent dca7ad8627
commit 9329953a61

View File

@ -17343,17 +17343,10 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
// TODO: replace all non-fatal assertions with returned errors or exceptions // TODO: replace all non-fatal assertions with returned errors or exceptions
struct llama_data_write { struct llama_data_write {
virtual void write(const void * src, size_t size) = 0; virtual void write(const void * src, size_t size) = 0;
virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
virtual size_t get_size_written() = 0; virtual size_t get_size_written() = 0;
virtual ~llama_data_write() = default; virtual ~llama_data_write() = default;
std::vector<uint8_t> temp_buffer;
virtual void * get_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) {
temp_buffer.resize(size);
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
return temp_buffer.data();
}
void write_string(const std::string & str) { void write_string(const std::string & str) {
uint32_t str_size = str.size(); uint32_t str_size = str.size();
@ -17474,8 +17467,7 @@ struct llama_data_write {
for (const auto & range : cell_ranges) { for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first; const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * k_size_row; const size_t buf_size = range_size * k_size_row;
const void * data = get_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
write(data, buf_size);
} }
} }
@ -17495,8 +17487,7 @@ struct llama_data_write {
for (const auto & range : cell_ranges) { for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first; const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * v_size_row; const size_t buf_size = range_size * v_size_row;
const void * data = get_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
write(data, buf_size);
} }
} }
} else { } else {
@ -17523,8 +17514,7 @@ struct llama_data_write {
const size_t range_size = range.second - range.first; const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * kv_size) * v_size_el; const size_t src_offset = (range.first + j * kv_size) * v_size_el;
const size_t buf_size = range_size * v_size_el; const size_t buf_size = range_size * v_size_el;
const void * data = get_tensor_data(kv_self.v_l[il], src_offset, buf_size); write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
write(data, buf_size);
} }
} }
} }
@ -17883,14 +17873,12 @@ struct llama_data_write_dummy : llama_data_write {
llama_data_write_dummy() {} llama_data_write_dummy() {}
// TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
void write(const void * /* src */, size_t size) override { void write(const void * /* src */, size_t size) override {
size_written += size; size_written += size;
} }
void * get_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t /* size */) override { void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
return nullptr; size_written += size;
} }
size_t get_size_written() override { size_t get_size_written() override {
@ -17915,6 +17903,16 @@ struct llama_data_write_buffer : llama_data_write {
buf_size -= size; buf_size -= size;
} }
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
ggml_backend_tensor_get(tensor, ptr, offset, size);
ptr += size;
size_written += size;
buf_size -= size;
}
size_t get_size_written() override { size_t get_size_written() override {
return size_written; return size_written;
} }
@ -17950,6 +17948,7 @@ struct llama_data_read_buffer : llama_data_read {
struct llama_data_write_file : llama_data_write { struct llama_data_write_file : llama_data_write {
llama_file * file; llama_file * file;
size_t size_written = 0; size_t size_written = 0;
std::vector<uint8_t> temp_buffer;
llama_data_write_file(llama_file * f) : file(f) {} llama_data_write_file(llama_file * f) : file(f) {}
@ -17958,6 +17957,12 @@ struct llama_data_write_file : llama_data_write {
size_written += size; size_written += size;
} }
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
temp_buffer.resize(size);
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
write(temp_buffer.data(), temp_buffer.size());
}
size_t get_size_written() override { size_t get_size_written() override {
return size_written; return size_written;
} }