ggml : sync latest ggml_mul_mat_id

This commit is contained in:
Georgi Gerganov 2023-12-09 11:19:46 +02:00
parent a3eefe95a8
commit 861cd67899
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
4 changed files with 114 additions and 75 deletions

View File

@ -1,13 +1,15 @@
#include <algorithm> #include <algorithm>
#include <assert.h>
#include <atomic>
#include <cinttypes>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <cinttypes>
#include <float.h> #include <float.h>
#include <limits> #include <limits>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <atomic> #include <vector>
#include <assert.h>
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
@ -8234,36 +8236,51 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
} }
#endif #endif
static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#if 0 #if 0
//#ifdef CUDA_USE_TENSOR_CORES
// const bool use_tensor_cores = true;
//#else
// const bool use_tensor_cores = false;
//#endif
ggml_cuda_mul_mat_id_cublas(dst); ggml_cuda_mul_mat_id_cublas(dst);
// TODO: mmq/mmv support // TODO: mmq/mmv support
#else
const struct ggml_tensor * ids = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const int id = dst->op_params[0];
int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
int32_t a_id;
CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
const struct ggml_tensor * src0 = dst->src[a_id + 2];
ggml_cuda_mul_mat(src0, src1, dst);
#endif #endif
(void) _src0; const struct ggml_tensor * ids = src0;
(void) _src1; const int32_t id = dst->op_params[0];
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
std::vector<char> ids_host(ggml_nbytes(ids));
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
ggml_tensor_extra_gpu src1_row_extra;
ggml_tensor_extra_gpu dst_row_extra;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
src1_row.ne[1] = 1;
dst_row.ne[1] = 1;
src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
//int32_t row_id;
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
}
} }
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

View File

@ -177,6 +177,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
} else { } else {
char* buffer2 = malloc(len+1); char* buffer2 = malloc(len+1);
va_end(args);
va_start(args, format);
vsnprintf(buffer2, len+1, format, args); vsnprintf(buffer2, len+1, format, args);
buffer2[len] = 0; buffer2[len] = 0;
ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
@ -1193,7 +1195,9 @@ void ggml_metal_graph_compute(
const float scale = ((float *) dst->op_params)[0]; const float scale = ((float *) dst->op_params)[0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
if (id_src1) {
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
}
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
@ -1511,9 +1515,7 @@ void ggml_metal_graph_compute(
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
} }
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3]; [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3];
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4]; [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5];
@ -1523,7 +1525,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; [encoder setBytes:&r2 length:sizeof(r2) atIndex:13];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:14];
[encoder setBytes:&idx length:sizeof(idx) atIndex:15]; [encoder setBytes:&idx length:sizeof(idx) atIndex:15];
@ -1538,8 +1540,15 @@ void ggml_metal_graph_compute(
} }
[encoder setThreadgroupMemoryLength:8192 atIndex:0]; [encoder setThreadgroupMemoryLength:8192 atIndex:0];
for (int64_t i01 = 0; i01 < src0->ne[1]; i01++) {
[encoder setBuffer:id_src0 offset:offs_src0 + i01*nb01 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 + i01*nb11 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst + i01*nb1 atIndex:2];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} }
}
} break; } break;
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {

53
ggml.c
View File

@ -4083,7 +4083,9 @@ struct ggml_tensor * ggml_mul_mat_id(
int64_t n_as = ids->ne[0]; int64_t n_as = ids->ne[0];
GGML_ASSERT(ids->type == GGML_TYPE_I32); GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_vector(ids)); GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
GGML_ASSERT(ids->ne[1] == b->ne[1]);
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2); GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
GGML_ASSERT(id >= 0 && id < n_as); GGML_ASSERT(id >= 0 && id < n_as);
@ -9519,11 +9521,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
} }
#endif #endif
// off1 = offset in i11 and i1
// cne1 = ne11 and ne1
// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst,
int64_t off1, int64_t cne1) {
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
@ -9592,9 +9599,8 @@ static void ggml_compute_forward_mul_mat(
const int64_t i02 = i12/r2; const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03; const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) { if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata; float * const wdata = params->wdata;
@ -9611,7 +9617,7 @@ static void ggml_compute_forward_mul_mat(
} }
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10, cne1, ne01, ne10,
1.0f, y, ne10, 1.0f, y, ne10,
x, ne00, x, ne00,
0.0f, d, ne01); 0.0f, d, ne01);
@ -9630,6 +9636,7 @@ static void ggml_compute_forward_mul_mat(
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
assert(params->wsize >= ne11*ne12*ne13*row_size); assert(params->wsize >= ne11*ne12*ne13*row_size);
assert(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i12 = 0; i12 < ne12; ++i12) {
@ -9652,7 +9659,7 @@ static void ggml_compute_forward_mul_mat(
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const int64_t nr0 = ne01; // src0 rows const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne11*ne12*ne13; // src1 rows const int64_t nr1 = cne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
@ -9694,9 +9701,9 @@ static void ggml_compute_forward_mul_mat(
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*ne11)); const int64_t i13 = (ir1/(ne12*cne1));
const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
// broadcast src0 into src1 // broadcast src0 into src1
const int64_t i03 = i13/r3; const int64_t i03 = i13/r3;
@ -9736,20 +9743,26 @@ static void ggml_compute_forward_mul_mat(
static void ggml_compute_forward_mul_mat_id( static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
const struct ggml_tensor * ids = dst->src[0]; if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
const struct ggml_tensor * src1 = dst->src[1]; // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
return;
}
const struct ggml_tensor * ids = src0;
const int id = ggml_get_op_params_i32(dst, 0); const int id = ggml_get_op_params_i32(dst, 0);
const int a_id = ((int32_t *)ids->data)[id]; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]);
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]); const struct ggml_tensor * src0_row = dst->src[row_id + 2];
ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
const struct ggml_tensor * src0 = dst->src[a_id + 2]; }
ggml_compute_forward_mul_mat(params, src0, src1, dst);
} }
// ggml_compute_forward_out_prod // ggml_compute_forward_out_prod
@ -14037,11 +14050,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break; } break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
{ {
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
} break; } break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
{ {
ggml_compute_forward_mul_mat_id(params, tensor); ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
} break; } break;
case GGML_OP_OUT_PROD: case GGML_OP_OUT_PROD:
{ {

View File

@ -770,11 +770,9 @@ struct test_mul_mat_id : public test_case {
const int64_t m; const int64_t m;
const int64_t n; const int64_t n;
const int64_t k; const int64_t k;
const std::array<int64_t, 2> bs; // dims 3 and 4
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
std::string vars() override { std::string vars() override {
return VARS_TO_STR9(type_a, type_b, n_mats, id, m, n, k, bs, nr); return VARS_TO_STR7(type_a, type_b, n_mats, id, m, n, k);
} }
double max_nmse_err() override { double max_nmse_err() override {
@ -782,7 +780,7 @@ struct test_mul_mat_id : public test_case {
} }
size_t op_size(ggml_tensor * t) override { size_t op_size(ggml_tensor * t) override {
size_t a = ggml_nbytes(t->src[2]) * n * nr[0] * nr[1]; size_t a = ggml_nbytes(t->src[2]) * n;
size_t b = ggml_nbytes(t->src[1]) * m; size_t b = ggml_nbytes(t->src[1]) * m;
size_t c = ggml_nbytes(t); size_t c = ggml_nbytes(t);
return a + b + c; return a + b + c;
@ -792,35 +790,37 @@ struct test_mul_mat_id : public test_case {
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
int n_mats = 2, int id = 0, int n_mats = 2, int id = 0,
int64_t m = 32, int64_t n = 32, int64_t k = 32, int64_t m = 32, int64_t n = 32, int64_t k = 32)
std::array<int64_t, 2> bs = {10, 10},
std::array<int64_t, 2> nr = {2, 2})
: type_a(type_a), type_b(type_b), n_mats(n_mats), id(id), : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
m(m), n(n), k(k), bs(bs), nr(nr) {} m(m), n(n), k(k) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
// C^T = A * B^T: (k, m) * (k, n) => (m, n) // C^T = A * B^T: (k, m) * (k, n) => (m, n)
std::vector<ggml_tensor *> mats; std::vector<ggml_tensor *> mats;
for (int i = 0; i < n_mats; i++) { for (int i = 0; i < n_mats; i++) {
ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]); ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
mats.push_back(a); mats.push_back(a);
} }
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_mats); ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b); ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b);
return out; return out;
} }
void initialize_tensors(ggml_context * ctx) override { void initialize_tensors(ggml_context * ctx) override {
std::random_device rd;
std::default_random_engine rng(rd());
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) { if (t->type == GGML_TYPE_I32) {
// ids // ids
std::vector<int> data(n_mats); for (int64_t r = 0; r < ggml_nrows(t); r++) {
for (int i = 0; i < n_mats; i++) { std::vector<int32_t> data(t->ne[0]);
for (int i = 0; i < t->ne[0]; i++) {
data[i] = i; data[i] = i;
} }
std::shuffle(data.begin(), data.end(), std::default_random_engine(std::random_device()())); std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), 0, n_mats * sizeof(int)); ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
}
} else { } else {
init_tensor_uniform(t); init_tensor_uniform(t);
} }
@ -1215,7 +1215,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {1, 2, 4}) { for (int n_mats : {1, 2, 4}) {
for (int id = 0; id < n_mats; id++) { for (int id = 0; id < n_mats; id++) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, {1, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256));
} }
} }
} }