Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-29 07:34:18 +01:00)

commit 861cd67899
parent a3eefe95a8

    ggml : sync latest ggml_mul_mat_id

ggml-cuda.cu (73 changed lines)

@@ -1,13 +1,15 @@
+#include <algorithm>
+#include <assert.h>
+#include <atomic>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
-#include <cinttypes>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
-#include <atomic>
-#include <assert.h>
+#include <vector>
 
 
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -8234,36 +8236,51 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
 }
 #endif
 
-static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #if 0
 //#ifdef CUDA_USE_TENSOR_CORES
 //    const bool use_tensor_cores = true;
 //#else
 //    const bool use_tensor_cores = false;
 //#endif
 
     ggml_cuda_mul_mat_id_cublas(dst);
 
     // TODO: mmq/mmv support
-#else
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const int id = dst->op_params[0];
-
-    int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-
-    int32_t a_id;
-    CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
-
-    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
-    const struct ggml_tensor * src0 = dst->src[a_id + 2];
-
-    ggml_cuda_mul_mat(src0, src1, dst);
 #endif
 
-    (void) _src0;
-    (void) _src1;
+    const struct ggml_tensor * ids = src0;
+    const int32_t id = dst->op_params[0];
+    const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * dst_extra  = (const ggml_tensor_extra_gpu *) dst->extra;
+
+    ggml_tensor_extra_gpu src1_row_extra;
+    ggml_tensor_extra_gpu dst_row_extra;
+
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row  = *dst;
+
+    src1_row.ne[1] = 1;
+    dst_row.ne[1]  = 1;
+
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra  = &dst_row_extra;
+
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        //int32_t row_id;
+        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
+        dst_row_extra.data_device[g_main_device]  = (char *) dst_extra->data_device[g_main_device]  + i01*dst->nb[1];
+
+        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+    }
 }
 
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
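
A note on the scheme above: rather than picking a single expert matrix for the whole batch, ggml_cuda_mul_mat_id now copies ids to the host once and then issues one ordinary mul-mat per src1 row, pointing shallow one-row copies of src1 and dst at the right device offsets. The following self-contained CPU sketch illustrates the same per-row routing with plain float buffers; all names are hypothetical and nothing here is the backend's actual API:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// out[r] = experts[ids[r]] * x[r]: one expert matmul per input row,
// mirroring the row loop in ggml_cuda_mul_mat_id above
static void mul_mat_id_rows(
        const std::vector<std::vector<float>> & experts, // n_as matrices, each m*k
        const std::vector<int32_t>            & ids,     // one expert index per row
        const std::vector<float>              & x,       // n_rows*k input rows
        std::vector<float>                    & out,     // n_rows*m output rows
        int m, int k) {
    for (std::size_t r = 0; r < ids.size(); ++r) {
        const std::vector<float> & w = experts[ids[r]]; // route this row
        for (int i = 0; i < m; ++i) {
            float acc = 0.0f;
            for (int j = 0; j < k; ++j) {
                acc += w[i*k + j] * x[r*k + j];
            }
            out[r*m + i] = acc;
        }
    }
}

int main() {
    const int m = 2, k = 3;
    const std::vector<std::vector<float>> experts = {
        {1,0,0, 0,1,0}, // expert 0
        {2,0,0, 0,0,2}, // expert 1
    };
    const std::vector<int32_t> ids = {0, 1};  // row 0 -> expert 0, row 1 -> expert 1
    const std::vector<float>   x   = {1,2,3, 4,5,6};
    std::vector<float> out(ids.size()*m);
    mul_mat_id_rows(experts, ids, x, out, m, k);
    for (float v : out) std::printf("%g ", v); // prints: 1 2 8 12
    std::printf("\n");
}

On the GPU the per-row "views" are just copies of src1/dst with ne[1] set to 1 and their extra->data_device pointer advanced by i01*nb[1], so the existing ggml_cuda_mul_mat path is reused unchanged.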

ggml-metal.m (21 changed lines)

@@ -177,6 +177,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
     } else {
         char* buffer2 = malloc(len+1);
+        va_end(args);
+        va_start(args, format);
         vsnprintf(buffer2, len+1, format, args);
         buffer2[len] = 0;
         ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
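
For context on the hunk above: the log path formats the message a second time when it does not fit the stack buffer, and a va_list that has been consumed by vsnprintf must be re-initialized before it can be used again; skipping the va_end/va_start pair is undefined behavior in C. A minimal standalone sketch of the same measure-then-format pattern (illustrative names, not the Metal backend's code):

#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// One vsnprintf pass to measure, a second to write. The detail the patch
// enforces: restart the va_list (va_end + va_start, or va_copy) between
// the two passes.
static std::string fmt_string(const char * format, ...) {
    va_list args;

    va_start(args, format);
    const int len = std::vsnprintf(nullptr, 0, format, args); // pass 1: measure
    va_end(args);

    std::vector<char> buf(len + 1);
    va_start(args, format);                                   // restart before reuse
    std::vsnprintf(buf.data(), buf.size(), format, args);     // pass 2: write
    va_end(args);

    return std::string(buf.data(), len);
}

int main() {
    std::printf("%s\n", fmt_string("%s: %d", "n_as", 8).c_str());
}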
@@ -1193,7 +1195,9 @@ void ggml_metal_graph_compute(
                         const float scale = ((float *) dst->op_params)[0];
 
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        if (id_src1) {
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        }
                         [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                         [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                         [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
@@ -1511,9 +1515,7 @@ void ggml_metal_graph_compute(
                             case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
                             default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                         }
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                        const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
                         [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3];
                         [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4];
                         [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5];
@@ -1523,7 +1525,7 @@ void ggml_metal_graph_compute(
                         [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
                         [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
                         [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
-                        [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
+                        [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12];
                         [encoder setBytes:&r2 length:sizeof(r2) atIndex:13];
                         [encoder setBytes:&r3 length:sizeof(r3) atIndex:14];
                         [encoder setBytes:&idx length:sizeof(idx) atIndex:15];
@@ -1538,7 +1540,14 @@ void ggml_metal_graph_compute(
                         }
 
                         [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                        [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+
+                        for (int64_t i01 = 0; i01 < src0->ne[1]; i01++) {
+                            [encoder setBuffer:id_src0 offset:offs_src0 + i01*nb01 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 + i01*nb11 atIndex:1];
+                            [encoder setBuffer:id_dst offset:offs_dst + i01*nb1 atIndex:2];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                        }
                     }
                 } break;
             case GGML_OP_GET_ROWS:

ggml.c (61 changed lines)

@@ -4083,7 +4083,9 @@ struct ggml_tensor * ggml_mul_mat_id(
     int64_t n_as = ids->ne[0];
 
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_vector(ids));
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
     GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
     GGML_ASSERT(id >= 0 && id < n_as);
 
@@ -9519,11 +9521,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+              struct ggml_tensor * dst,
+              int64_t off1, int64_t cne1) {
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
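
For context before the remaining hunks: the new off1/cne1 pair restricts ggml_compute_forward_mul_mat to a window of cne1 src1 rows starting at row off1 (a plain mul-mat passes off1 = 0 and cne1 = ne1, while mul_mat_id passes one row at a time), and the hunks below rethread the BLAS row count and the flattened row-index decomposition through cne1. A small self-contained check of that decomposition, with made-up shapes; the arithmetic matches the patched loop shown further down:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // made-up shapes: ne12*ne13 broadcast planes and a window of cne1
    // rows starting at off1 (a plain mul_mat has off1 = 0, cne1 = ne11)
    const int64_t ne12 = 2, ne13 = 3;
    const int64_t off1 = 5, cne1 = 4;

    const int64_t nr1 = cne1*ne12*ne13; // src1 rows visited, as in the patch

    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
        // decomposition used after the patch: cne1 replaces ne11 and
        // off1 shifts i11 into the window
        const int64_t i13 = (ir1/(ne12*cne1));
        const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
        const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;

        // recomposing must give back ir1, and i11 must stay in the window
        assert((i13*ne12 + i12)*cne1 + (i11 - off1) == ir1);
        assert(i11 >= off1 && i11 < off1 + cne1);
    }
    std::printf("all %lld flattened indices decompose consistently\n", (long long) nr1);
}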
@@ -9591,10 +9598,9 @@ static void ggml_compute_forward_mul_mat(
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+                float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -9611,10 +9617,10 @@ static void ggml_compute_forward_mul_mat(
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
+                        cne1, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne00,
+                        0.0f,    d, ne01);
             }
         }
 
@@ -9630,6 +9636,7 @@ static void ggml_compute_forward_mul_mat(
         const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
         assert(params->wsize >= ne11*ne12*ne13*row_size);
+        assert(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
             for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9652,7 +9659,7 @@ static void ggml_compute_forward_mul_mat(
     const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
     const int64_t nr0 = ne01;           // src0 rows
-    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+    const int64_t nr1 = cne1*ne12*ne13; // src1 rows
 
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9694,9 +9701,9 @@ static void ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*ne11));
-                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+                const int64_t i13 = (ir1/(ne12*cne1));
+                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
 
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ -9736,20 +9743,26 @@ static void ggml_compute_forward_mul_mat(
 
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
 
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+        return;
+    }
 
+    const struct ggml_tensor * ids = src0;
    const int id = ggml_get_op_params_i32(dst, 0);
 
-    const int a_id = ((int32_t *)ids->data)[id];
-
-    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
-
-    const struct ggml_tensor * src0 = dst->src[a_id + 2];
-
-    ggml_compute_forward_mul_mat(params, src0, src1, dst);
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+    }
 }
 
 // ggml_compute_forward_out_prod
@@ -14037,11 +14050,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
             } break;
         case GGML_OP_MUL_MAT_ID:
            {
-                ggml_compute_forward_mul_mat_id(params, tensor);
+                ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_OUT_PROD:
             {

tests/test-backend-ops.cpp

@@ -770,11 +770,9 @@ struct test_mul_mat_id : public test_case {
     const int64_t m;
     const int64_t n;
     const int64_t k;
-    const std::array<int64_t, 2> bs; // dims 3 and 4
-    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
 
     std::string vars() override {
-        return VARS_TO_STR9(type_a, type_b, n_mats, id, m, n, k, bs, nr);
+        return VARS_TO_STR7(type_a, type_b, n_mats, id, m, n, k);
     }
 
     double max_nmse_err() override {
@@ -782,7 +780,7 @@ struct test_mul_mat_id : public test_case {
     }
 
     size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[2]) * n * nr[0] * nr[1];
+        size_t a = ggml_nbytes(t->src[2]) * n;
         size_t b = ggml_nbytes(t->src[1]) * m;
         size_t c = ggml_nbytes(t);
         return a + b + c;
@@ -792,35 +790,37 @@ struct test_mul_mat_id : public test_case {
 
     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
             int n_mats = 2, int id = 0,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32,
-            std::array<int64_t, 2> bs = {10, 10},
-            std::array<int64_t, 2> nr = {2, 2})
+            int64_t m = 32, int64_t n = 32, int64_t k = 32)
         : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
-            m(m), n(n), k(k), bs(bs), nr(nr) {}
+            m(m), n(n), k(k) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         std::vector<ggml_tensor *> mats;
         for (int i = 0; i < n_mats; i++) {
-            ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
+            ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
             mats.push_back(a);
         }
-        ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_mats);
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+        ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
+        ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
         ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b);
        return out;
     }
 
     void initialize_tensors(ggml_context * ctx) override {
+        std::random_device rd;
+        std::default_random_engine rng(rd());
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I32) {
                 // ids
-                std::vector<int> data(n_mats);
-                for (int i = 0; i < n_mats; i++) {
-                    data[i] = i;
+                for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                    std::vector<int32_t> data(t->ne[0]);
+                    for (int i = 0; i < t->ne[0]; i++) {
+                        data[i] = i;
+                    }
+                    std::shuffle(data.begin(), data.end(), rng);
+                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                 }
-                std::shuffle(data.begin(), data.end(), std::default_random_engine(std::random_device()()));
-                ggml_backend_tensor_set(t, data.data(), 0, n_mats * sizeof(int));
             } else {
                 init_tensor_uniform(t);
             }
@@ -1215,7 +1215,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
         for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
             for (int n_mats : {1, 2, 4}) {
                 for (int id = 0; id < n_mats; id++) {
-                    test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, {1, 1}, {1, 1}));
+                    test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256));
                }
            }
        }
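
One more note on initialize_tensors: each row of the now two-dimensional ids tensor gets its own shuffled permutation of 0..n_mats-1, so every src1 row can route to a different expert during the test. The filling step boils down to this self-contained sketch, with a plain buffer standing in for ggml_backend_tensor_set:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

int main() {
    const int64_t n_mats = 4; // experts per row (ids->ne[0])
    const int64_t n_rows = 3; // src1 rows       (ids->ne[1])

    std::random_device rd;
    std::default_random_engine rng(rd());

    std::vector<int32_t> ids(n_mats*n_rows);
    for (int64_t r = 0; r < n_rows; ++r) {
        // each row gets its own permutation, mirroring the per-row loop
        // added to initialize_tensors
        std::vector<int32_t> row(n_mats);
        std::iota(row.begin(), row.end(), 0);
        std::shuffle(row.begin(), row.end(), rng);
        std::copy(row.begin(), row.end(), ids.begin() + r*n_mats);
    }

    for (int64_t r = 0; r < n_rows; ++r) {
        for (int64_t i = 0; i < n_mats; ++i) {
            std::printf("%d ", ids[r*n_mats + i]);
        }
        std::printf("\n");
    }
}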