ggml : multi-thread ggml_rope() (~3-4 times faster on M1) (#781)

This commit is contained in:
Georgi Gerganov 2023-04-05 22:11:03 +03:00 committed by GitHub
parent 986b6ce9f9
commit eeaa7b0492
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

41
ggml.c
View File

@@ -7238,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32); assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3); assert(ggml_nelements(src1) == 3);
@@ -7265,11 +7264,28 @@ static void ggml_compute_forward_rope_f32(
assert(nb0 == sizeof(float)); assert(nb0 == sizeof(float));
// TODO: optimize
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
const int p = (mode == 0 ? n_past + i2 : i2); const int p = (mode == 0 ? n_past + i2 : i2);
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
for (int i0 = 0; i0 < n_dims; i0 += 2) { for (int i0 = 0; i0 < n_dims; i0 += 2) {
const float theta = powf(10000.0, ((float)-i0)/n_dims); const float theta = powf(10000.0, ((float)-i0)/n_dims);
@@ -7295,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32); assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3); assert(ggml_nelements(src1) == 3);
@@ -7322,10 +7337,28 @@ static void ggml_compute_forward_rope_f16(
assert(nb0 == sizeof(ggml_fp16_t)); assert(nb0 == sizeof(ggml_fp16_t));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
const int p = (mode == 0 ? n_past + i2 : i2); const int p = (mode == 0 ? n_past + i2 : i2);
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
for (int i0 = 0; i0 < n_dims; i0 += 2) { for (int i0 = 0; i0 < n_dims; i0 += 2) {
const float theta = powf(10000.0, ((float)-i0)/n_dims); const float theta = powf(10000.0, ((float)-i0)/n_dims);
@@ -9424,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
node->n_tasks = 1; node->n_tasks = n_threads;
} break; } break;
case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S: case GGML_OP_CONV_1D_2S: