mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 00:39:00 +01:00
llama : remove experimental stuff
This commit is contained in:
parent
e343b8b4d8
commit
e7e7b11455
@ -925,9 +925,7 @@ void ggml_metal_graph_compute(
|
|||||||
nth1 = 1;
|
nth1 = 1;
|
||||||
if (ne11 * ne12 < 4) {
|
if (ne11 * ne12 < 4) {
|
||||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
||||||
//} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||||
} else if (false) {
|
|
||||||
// TODO: with ggml_mul_mat_pad this kernel no longer seems to be needed
|
|
||||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
||||||
nrows = ne11;
|
nrows = ne11;
|
||||||
} else {
|
} else {
|
||||||
|
44
llama.cpp
44
llama.cpp
@ -438,50 +438,6 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
|||||||
ggml_graph_compute(graph, &plan);
|
ggml_graph_compute(graph, &plan);
|
||||||
}
|
}
|
||||||
|
|
||||||
//// EXPERIMENTAL:
|
|
||||||
////
|
|
||||||
//// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
|
||||||
//// the idea is to represent the original matrix multiplication:
|
|
||||||
////
|
|
||||||
//// Z = X @ Y
|
|
||||||
////
|
|
||||||
//// with the sum of two matrix multiplications:
|
|
||||||
////
|
|
||||||
//// Z = (X_0 @ Y_0) + (X_1 @ Y_1)
|
|
||||||
////
|
|
||||||
//// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
|
|
||||||
//// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
|
|
||||||
//// general-purpose kernels
|
|
||||||
////
|
|
||||||
//static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
|
|
||||||
////#if !defined(GGML_USE_METAL)
|
|
||||||
//// return ggml_mul_mat(ctx, x, y);
|
|
||||||
////#endif
|
|
||||||
//
|
|
||||||
// // use padding only if dimension 0 is at least 8 times larger than the padding
|
|
||||||
// // else we won't get much benefit from the optimization
|
|
||||||
// const int n_pad_req = 8;
|
|
||||||
//
|
|
||||||
// if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
|
|
||||||
// return ggml_mul_mat(ctx, x, y);
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
|
|
||||||
// struct ggml_tensor * x_1 = ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
|
|
||||||
//
|
|
||||||
// struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
|
|
||||||
// struct ggml_tensor * y_1 = ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
|
|
||||||
//
|
|
||||||
// return ggml_add(ctx,
|
|
||||||
// ggml_mul_mat(ctx, x_0, y_0),
|
|
||||||
// ggml_mul_mat(ctx, x_1, y_1));
|
|
||||||
//}
|
|
||||||
//
|
|
||||||
//// TODO: check if other backends benefit from this and enable for all
|
|
||||||
//#if defined(GGML_USE_METAL)
|
|
||||||
////#define ggml_mul_mat ggml_mul_mat_pad
|
|
||||||
//#endif
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// llama helpers
|
// llama helpers
|
||||||
//
|
//
|
||||||
|
Loading…
Reference in New Issue
Block a user