mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 11:23:56 +01:00
rebase to master (except ggml-cuda)
This commit is contained in:
parent
33ab185dd1
commit
9c72e7e916
@ -289,7 +289,7 @@ void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_t
|
|||||||
|
|
||||||
if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) {
|
if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) {
|
||||||
if (splits->n_splits > 0) {
|
if (splits->n_splits > 0) {
|
||||||
char name[GGML_MAX_NAME - 1]; // silence -Wformat-truncation
|
char name[GGML_MAX_NAME];
|
||||||
vsnprintf(name, sizeof(name), fmt, args);
|
vsnprintf(name, sizeof(name), fmt, args);
|
||||||
char new_name[GGML_MAX_NAME];
|
char new_name[GGML_MAX_NAME];
|
||||||
snprintf(new_name, sizeof(new_name), "%s,%s", splits->splits[splits->n_splits - 1].name, name);
|
snprintf(new_name, sizeof(new_name), "%s,%s", splits->splits[splits->n_splits - 1].name, name);
|
||||||
|
30
ggml-cuda.cu
30
ggml-cuda.cu
@ -1475,8 +1475,8 @@ static void ggml_cuda_mul_mat(ggml_cuda_context * ctx, ggml_tensor * src0, ggml_
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cudaStream_t stream) {
|
static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cudaStream_t stream) {
|
||||||
ggml_tensor * src0 = node->src0;
|
ggml_tensor * src0 = node->src[0];
|
||||||
ggml_tensor * src1 = node->src1;
|
ggml_tensor * src1 = node->src[1];
|
||||||
ggml_tensor * dst = node;
|
ggml_tensor * dst = node;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
@ -1551,8 +1551,6 @@ static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cud
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static const int GGML_MAX_PARENTS = 2 + GGML_MAX_OPT;
|
|
||||||
|
|
||||||
static bool ggml_is_noop(ggml_tensor * t) {
|
static bool ggml_is_noop(ggml_tensor * t) {
|
||||||
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
||||||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE;
|
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE;
|
||||||
@ -1581,26 +1579,20 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph *
|
|||||||
ggml_tensor * node = gf->nodes[i];
|
ggml_tensor * node = gf->nodes[i];
|
||||||
const bool is_noop = ggml_is_noop(node);
|
const bool is_noop = ggml_is_noop(node);
|
||||||
|
|
||||||
// build a list of parents
|
|
||||||
ggml_tensor * parents[GGML_MAX_PARENTS] = { node->src0, node->src1 };
|
|
||||||
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
|
||||||
parents[j + 2] = node->opt[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
// assign an stream for the node
|
// assign an stream for the node
|
||||||
cudaStream_t stream = nullptr;
|
cudaStream_t stream = nullptr;
|
||||||
|
|
||||||
// take a stream from a parent
|
// take a stream from a parent
|
||||||
for (int j = 0; j < GGML_MAX_PARENTS; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
if (parents[j] && stream_map.count(parents[j]) && stream_map[parents[j]] != nullptr) {
|
if (node->src[j] && stream_map.count(node->src[j]) && stream_map[node->src[j]] != nullptr) {
|
||||||
stream = stream_map[parents[j]];
|
stream = stream_map[node->src[j]];
|
||||||
stream_map.erase(parents[j]);
|
stream_map.erase(node->src[j]);
|
||||||
|
|
||||||
if (is_noop) {
|
if (is_noop) {
|
||||||
// if this is a noop, we can use the parent's event
|
// if this is a noop, we can use the parent's event
|
||||||
stream_map[node] = stream;
|
stream_map[node] = stream;
|
||||||
if (event_map.count(parents[j]) > 0) {
|
if (event_map.count(node->src[j]) > 0) {
|
||||||
event_map[node] = event_map[parents[j]];
|
event_map[node] = event_map[node->src[j]];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -1624,9 +1616,9 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph *
|
|||||||
|
|
||||||
// wait on parent streams
|
// wait on parent streams
|
||||||
bool waited = false;
|
bool waited = false;
|
||||||
for (int j = 0; j < GGML_MAX_PARENTS; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
if (parents[j] && event_map.count(parents[j]) > 0) {
|
if (node->src[j] && event_map.count(node->src[j]) > 0) {
|
||||||
CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[parents[j]], 0));
|
CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[node->src[j]], 0));
|
||||||
waited = true;
|
waited = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
11
ggml.c
11
ggml.c
@ -6855,7 +6855,9 @@ struct ggml_tensor * ggml_rope_impl(
|
|||||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
|
|
||||||
// TODO: just use a struct
|
// TODO: just use a struct
|
||||||
int32_t params[] = { n_past, n_dims, mode, n_ctx, *(int32_t*)&freq_base, *(int32_t*)&freq_scale};
|
int32_t params[6] = { n_past, n_dims, mode, n_ctx };
|
||||||
|
memcpy(params + 4, &freq_base, sizeof(float));
|
||||||
|
memcpy(params + 5, &freq_scale, sizeof(float));
|
||||||
assert(GGML_MAX_OP_PARAMS >= sizeof(params));
|
assert(GGML_MAX_OP_PARAMS >= sizeof(params));
|
||||||
memcpy(result->params, &params, sizeof(params));
|
memcpy(result->params, &params, sizeof(params));
|
||||||
|
|
||||||
@ -7127,13 +7129,11 @@ struct ggml_tensor* ggml_pool_1d(
|
|||||||
};
|
};
|
||||||
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
||||||
|
|
||||||
ggml_scratch_save(ctx);
|
|
||||||
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
|
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
|
||||||
((int32_t*)c->data)[0] = op;
|
((int32_t*)c->data)[0] = op;
|
||||||
((int32_t*)c->data)[1] = k0;
|
((int32_t*)c->data)[1] = k0;
|
||||||
((int32_t*)c->data)[2] = s0;
|
((int32_t*)c->data)[2] = s0;
|
||||||
((int32_t*)c->data)[3] = p0;
|
((int32_t*)c->data)[3] = p0;
|
||||||
ggml_scratch_load(ctx);
|
|
||||||
|
|
||||||
result->op = GGML_OP_POOL_1D;
|
result->op = GGML_OP_POOL_1D;
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
@ -7170,7 +7170,6 @@ struct ggml_tensor* ggml_pool_2d(
|
|||||||
};
|
};
|
||||||
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
||||||
|
|
||||||
ggml_scratch_save(ctx);
|
|
||||||
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
|
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
|
||||||
((int32_t*)c->data)[0] = op;
|
((int32_t*)c->data)[0] = op;
|
||||||
((int32_t*)c->data)[1] = k0;
|
((int32_t*)c->data)[1] = k0;
|
||||||
@ -7179,7 +7178,6 @@ struct ggml_tensor* ggml_pool_2d(
|
|||||||
((int32_t*)c->data)[4] = s1;
|
((int32_t*)c->data)[4] = s1;
|
||||||
((int32_t*)c->data)[5] = p0;
|
((int32_t*)c->data)[5] = p0;
|
||||||
((int32_t*)c->data)[6] = p1;
|
((int32_t*)c->data)[6] = p1;
|
||||||
ggml_scratch_load(ctx);
|
|
||||||
|
|
||||||
result->op = GGML_OP_POOL_2D;
|
result->op = GGML_OP_POOL_2D;
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
@ -15823,7 +15821,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->op == GGML_OP_NONE && node->src0 == NULL && node->src1 == NULL && node->grad == NULL) {
|
// TODO: add ggml_dependency instead of checking for NULL
|
||||||
|
if (node->op == GGML_OP_NONE && node->src[0] == NULL && node->src[1] == NULL && node->grad == NULL) {
|
||||||
// reached a leaf node, not part of the gradient graph (e.g. a constant)
|
// reached a leaf node, not part of the gradient graph (e.g. a constant)
|
||||||
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
|
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
|
||||||
|
|
||||||
|
2
ggml.h
2
ggml.h
@ -199,7 +199,7 @@
|
|||||||
#define GGML_MAX_CONTEXTS 64
|
#define GGML_MAX_CONTEXTS 64
|
||||||
#define GGML_MAX_SRC 6
|
#define GGML_MAX_SRC 6
|
||||||
#define GGML_MAX_NAME 48
|
#define GGML_MAX_NAME 48
|
||||||
#define GGML_MAX_OP_PARAMS 16
|
#define GGML_MAX_OP_PARAMS 32
|
||||||
#define GGML_DEFAULT_N_THREADS 4
|
#define GGML_DEFAULT_N_THREADS 4
|
||||||
|
|
||||||
|
|
||||||
|
@ -1168,7 +1168,7 @@ static ggml_graph_splits llama_build_graph(
|
|||||||
|
|
||||||
struct ggml_graph_splits splits = ggml_graph_split_init();
|
struct ggml_graph_splits splits = ggml_graph_split_init();
|
||||||
|
|
||||||
// initalize contexts for every backend
|
// initialize contexts for every backend
|
||||||
|
|
||||||
struct ggml_context * ctx_cpu = nullptr;
|
struct ggml_context * ctx_cpu = nullptr;
|
||||||
// TODO: don't create context if there are no CPU layers
|
// TODO: don't create context if there are no CPU layers
|
||||||
@ -1295,8 +1295,8 @@ static ggml_graph_splits llama_build_graph(
|
|||||||
// TODO: replace with ggml_dependency / ggml_depends_on
|
// TODO: replace with ggml_dependency / ggml_depends_on
|
||||||
k = ggml_view_tensor(ctx_kv, kv_self.k);
|
k = ggml_view_tensor(ctx_kv, kv_self.k);
|
||||||
v = ggml_view_tensor(ctx_kv, kv_self.v);
|
v = ggml_view_tensor(ctx_kv, kv_self.v);
|
||||||
k->src0 = k_cpy;
|
k->src[0] = k_cpy;
|
||||||
v->src0 = v_cpy;
|
v->src[0] = v_cpy;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * Q =
|
struct ggml_tensor * Q =
|
||||||
|
Loading…
Reference in New Issue
Block a user