mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 12:21:40 +01:00
Merge branch 'ggerganov:master' into bitnet
This commit is contained in:
commit
c0cd08d45e
@ -2,4 +2,4 @@
|
||||
- [ ] Review Complexity : Low
|
||||
- [ ] Review Complexity : Medium
|
||||
- [ ] Review Complexity : High
|
||||
- [ ] I have read the [contributing guidelines](CONTRIBUTING.md)
|
||||
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
|
9
.github/workflows/build.yml
vendored
9
.github/workflows/build.yml
vendored
@ -13,7 +13,7 @@ on:
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
@ -684,7 +684,7 @@ jobs:
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
runs-on: windows-2019
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
@ -829,7 +829,7 @@ jobs:
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-latest
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@ -843,8 +843,9 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- uses: Jimver/cuda-toolkit@v0.2.11
|
||||
- name: Install CUDA toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
|
@ -576,7 +576,9 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
vulkaninfo
|
||||
```
|
||||
|
||||
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
Alternatively your package manager might be able to provide the appropriate libraries.
|
||||
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||
|
||||
Then, build llama.cpp using the cmake command below:
|
||||
|
||||
|
@ -1033,6 +1033,27 @@ struct markdown_printer : public printer {
|
||||
if (field == "n_gpu_layers") {
|
||||
return 3;
|
||||
}
|
||||
if (field == "n_threads") {
|
||||
return 7;
|
||||
}
|
||||
if (field == "n_batch") {
|
||||
return 7;
|
||||
}
|
||||
if (field == "n_ubatch") {
|
||||
return 8;
|
||||
}
|
||||
if (field == "type_k" || field == "type_v") {
|
||||
return 6;
|
||||
}
|
||||
if (field == "split_mode") {
|
||||
return 5;
|
||||
}
|
||||
if (field == "flash_attn") {
|
||||
return 2;
|
||||
}
|
||||
if (field == "use_mmap") {
|
||||
return 4;
|
||||
}
|
||||
if (field == "test") {
|
||||
return 13;
|
||||
}
|
||||
|
@ -886,7 +886,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
||||
#endif
|
||||
for (size_t i = 0; i < *n_buffers; i++) {
|
||||
ggml_backend_buffer_free(*buffers[i]);
|
||||
ggml_backend_buffer_free((*buffers)[i]);
|
||||
}
|
||||
free(*buffers);
|
||||
return false;
|
||||
|
@ -1,5 +1,27 @@
|
||||
#include "common.cuh"
|
||||
|
||||
// Per-thread fragment of an "A" operand tile with I == 16 rows and K == 4 columns of ints.
// Every thread stores ne == 2 ints of the tile in x[]; get_i/get_k translate the
// thread's threadIdx.x and the local element index l into tile coordinates.
// NOTE(review): each int presumably packs four int8 values for the int8 MMA - confirm.
struct mma_int_A_I16K4 {
    static constexpr int I  = 16;
    static constexpr int K  = 4;
    static constexpr int ne = 2;

    int x[ne] = {0};

    // Row of element l inside the tile: odd l selects the second half of the rows.
    static __device__ __forceinline__ int get_i(const int l) {
        const int row = threadIdx.x / K + (I/2) * (l%2);
        GGML_CUDA_ASSUME(row >= 0);
        GGML_CUDA_ASSUME(row <  I);
        return row;
    }

    // Column of the thread's elements inside the tile; identical for every l.
    static __device__ __forceinline__ int get_k(const int /* l */) {
        const int col = threadIdx.x % K;
        GGML_CUDA_ASSUME(col >= 0);
        GGML_CUDA_ASSUME(col <  K);
        return col;
    }
};
|
||||
|
||||
struct mma_int_A_I16K8 {
|
||||
static constexpr int I = 16;
|
||||
static constexpr int K = 8;
|
||||
@ -22,6 +44,28 @@ struct mma_int_A_I16K8 {
|
||||
}
|
||||
};
|
||||
|
||||
// Per-thread fragment of a "B" operand tile with J == 8 rows and K == 4 columns of ints.
// Every thread stores a single int (ne == 1) of the tile in x[]; get_j/get_k derive
// its tile coordinates from threadIdx.x only.
// NOTE(review): each int presumably packs four int8 values for the int8 MMA - confirm.
struct mma_int_B_J8K4 {
    static constexpr int J  = 8;
    static constexpr int K  = 4;
    static constexpr int ne = 1;

    int x[ne] = {0};

    // Row (j index) of the thread's element inside the tile; independent of l.
    static __device__ __forceinline__ int get_j(const int /* l */) {
        const int row = threadIdx.x / K;
        GGML_CUDA_ASSUME(row >= 0);
        GGML_CUDA_ASSUME(row <  J);
        return row;
    }

    // Column (k index) of the thread's element inside the tile; independent of l.
    static __device__ __forceinline__ int get_k(const int /* l */) {
        const int col = threadIdx.x % K;
        GGML_CUDA_ASSUME(col >= 0);
        GGML_CUDA_ASSUME(col <  K);
        return col;
    }
};
|
||||
|
||||
struct mma_int_B_J8K8 {
|
||||
static constexpr int J = 8;
|
||||
static constexpr int K = 8;
|
||||
@ -65,6 +109,28 @@ struct mma_int_C_I16J8 {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Accumulates A * B into this fragment's four accumulator registers x[0..3], where A is a
// mma_int_A_I16K4 fragment (2 registers) and B is a mma_int_B_J8K4 fragment (1 register).
//   - INT8_MMA_AVAILABLE undefined: no device code is emitted (NO_DEVICE_CODE).
//   - __CUDA_ARCH__ >= CC_AMPERE:   a single m16n8k16 s8 mma.sync instruction.
//   - otherwise:                    two m8n8k16 s8 mma.sync instructions, one per A register.
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
#ifndef INT8_MMA_AVAILABLE
    GGML_UNUSED(mma_A);
    GGML_UNUSED(mma_B);
    NO_DEVICE_CODE;
#elif __CUDA_ARCH__ >= CC_AMPERE
    asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
        : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
        : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#else
    // On Turing m16n8k16 mma is not available, use 2x m8n8k16 mma instead:
    // first A register -> accumulators x[0], x[1]
    asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
        : "+r"(x[0]), "+r"(x[1])
        : "r"(mma_A.x[0]), "r"(mma_B.x[0]));
    // second A register -> accumulators x[2], x[3]
    asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
        : "+r"(x[2]), "+r"(x[3])
        : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#endif // INT8_MMA_AVAILABLE / __CUDA_ARCH__ >= CC_AMPERE
}
|
||||
|
||||
__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||
|
@ -1089,7 +1089,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, int nwarps>
|
||||
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
|
||||
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
|
||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
|
||||
|
||||
@ -1115,6 +1115,97 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
|
||||
}
|
||||
}
|
||||
|
||||
// Tile dot product q4_K x q8_1 using int8 MMA fragments (mma_int_C_I16J8::mma_K8).
//
// x_ql: packed 4 bit quants, row stride WARP_SIZE + 1 ints; the low/high nibbles are
//       selected by shifting with kvdr (0 or 4) and masking with 0x0F0F0F0F.
// x_dm: half2 per row/block; the low half multiplies the scaled dot products, the high
//       half multiplies the min terms (see the final accumulation into sum).
// x_qh: unused for this type.
// x_sc: per-row bytes; scales are read at sc[...], mins 8 bytes further at m[...].
// y:    q8_1 tile; the first 4 ints of each MMQ_TILE_Y_K row are read as half2 (y_ds),
//       the quantized values start 4 ints in (y_qs).
// sum:  per-thread accumulators, mma_C::ne floats for every block of mma_B::J columns.
// k0:   k offset inside the tile.
//
// threadIdx.y selects a band of mma_A::I rows; nwarps*mma_A::I must equal mmq_y.
// NOTE(review): this kernel previously used QI5_K / VDR_Q5_K_Q8_1_MMQ (copied from the
// q5_K variant). The q4_K constants are used here for consistency with the first loop;
// they are presumably equal in value so results should not change - confirm.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mma(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {

    GGML_UNUSED(x_qh);

    typedef mma_int_A_I16K8 mma_A;
    typedef mma_int_B_J8K8  mma_B;
    typedef mma_int_C_I16J8 mma_C;

    const int   * y_qs = (const int   *) y + 4;
    const half2 * y_ds = (const half2 *) y;

    const int i0 = threadIdx.y*mma_A::I;
    static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");

    // Operands that only depend on the x tile are loaded once, outside the loop over j0.
    mma_A A[2];
    int   scA[mma_C::ne/2][2];
    int   mA[mma_C::ne/2][2];
    half2 dmA[mma_C::ne/2];
#pragma unroll
    for (int kvdr = 0; kvdr < VDR_Q4_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
        for (int l = 0; l < mma_A::ne; ++l) {
            const int i = i0 + mma_A::get_i(l);
            const int k = k0 + mma_A::get_k(l);

            // kvdr == 0 extracts the low nibbles, kvdr == 4 the high nibbles.
            A[kvdr/4].x[l] = (x_ql[i*(WARP_SIZE + 1) + k] >> kvdr) & 0x0F0F0F0F;
        }

#pragma unroll
        for (int l = 0; l < mma_C::ne/2; ++l) {
            const int i = i0 + mma_C::get_i(2*l);

            const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
            const uint8_t *  m = sc + 8;

            scA[l][kvdr/4] = sc[kvdr/4];
            mA[l][kvdr/4]  =  m[kvdr/4];
        }
    }

#pragma unroll
    for (int l = 0; l < mma_C::ne/2; ++l) {
        const int i = i0 + mma_C::get_i(2*l);

        dmA[l] = x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K + k0/QI4_K];
    }

#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += mma_B::J) {
        float tmpd[mma_C::ne] = {0.0f};
        float tmpm[mma_C::ne] = {0.0f};

#pragma unroll
        for (int kvdr = 0; kvdr < VDR_Q4_K_Q8_1_MMQ; kvdr += 4) {
            mma_C C;
            mma_B B;
            half2 dsB[mma_C::ne/2];

#pragma unroll
            for (int l = 0; l < mma_B::ne; ++l) {
                const int j = j0 + mma_B::get_j(l);
                const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;

                B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
            }
#pragma unroll
            for (int l = 0; l < mma_C::ne/2; ++l) {
                const int j = j0 + mma_C::get_j(l);

                dsB[l] = y_ds[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
            }

            C.mma_K8(A[kvdr/4], B);

#pragma unroll
            for (int l = 0; l < mma_C::ne; ++l) {
                // Integer dot product times scale, and min term, each weighted by the y factors.
                tmpd[l] += (C.x[l]*scA[l/2][kvdr/4]) *  __low2float(dsB[l%2]);
                tmpm[l] += mA[l/2][kvdr/4]           * __high2float(dsB[l%2]);
            }
        }

#pragma unroll
        for (int l = 0; l < mma_C::ne; ++l) {
            sum[(j0/mma_B::J)*mma_C::ne + l] += __low2float(dmA[l/2])*tmpd[l] - __high2float(dmA[l/2])*tmpm[l];
        }
    }
}
|
||||
|
||||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
||||
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
|
||||
@ -1188,7 +1279,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, int nwarps>
|
||||
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
|
||||
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
|
||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
|
||||
|
||||
@ -1214,6 +1305,97 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
|
||||
}
|
||||
}
|
||||
|
||||
// Tile dot product q5_K x q8_1 using int8 MMA fragments (mma_int_C_I16J8::mma_K8).
//
// x_ql: quantized values, row stride QR5_K*WARP_SIZE + 1 ints, read without further unpacking.
// x_dm: half2 per row/block; the low half multiplies the scaled dot products, the high
//       half multiplies the min terms (see the final accumulation into sum).
// x_qh: unused in this function.
// x_sc: per-row bytes; scales are read at sc[...], mins 8 bytes further at m[...].
// y:    q8_1 tile; the first 4 ints of each MMQ_TILE_Y_K row are read as half2 (y_ds),
//       the quantized values start 4 ints in (y_qs).
// sum:  per-thread accumulators, mma_C::ne floats for every block of mma_B::J columns.
// k0:   k offset inside the tile.
//
// threadIdx.y selects a band of mma_A::I rows; nwarps*mma_A::I must equal mmq_y.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mma(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {

    // x_sc is read below, only x_qh is really unused.
    GGML_UNUSED(x_qh);

    typedef mma_int_A_I16K8 mma_A;
    typedef mma_int_B_J8K8  mma_B;
    typedef mma_int_C_I16J8 mma_C;

    const int   * y_qs = (const int   *) y + 4;
    const half2 * y_ds = (const half2 *) y;

    const int i0 = threadIdx.y*mma_A::I;
    static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");

    // Operands that only depend on the x tile are loaded once, outside the loop over j0.
    mma_A A[2];
    int   scA[mma_C::ne/2][2];
    int   mA[mma_C::ne/2][2];
    half2 dmA[mma_C::ne/2];
#pragma unroll
    for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
        for (int l = 0; l < mma_A::ne; ++l) {
            const int i = i0 + mma_A::get_i(l);
            const int k = QR5_K*k0 + QR5_K*kvdr + mma_A::get_k(l);

            A[kvdr/4].x[l] = x_ql[i*(QR5_K*WARP_SIZE + 1) + k];
        }

#pragma unroll
        for (int l = 0; l < mma_C::ne/2; ++l) {
            const int i = i0 + mma_C::get_i(2*l);

            const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
            const uint8_t *  m = sc + 8;

            scA[l][kvdr/4] = sc[kvdr/4];
            mA[l][kvdr/4]  =  m[kvdr/4];
        }
    }

#pragma unroll
    for (int l = 0; l < mma_C::ne/2; ++l) {
        const int i = i0 + mma_C::get_i(2*l);

        dmA[l] = x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K + k0/QI5_K];
    }

#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += mma_B::J) {
        float tmpd[mma_C::ne] = {0.0f};
        float tmpm[mma_C::ne] = {0.0f};

#pragma unroll
        for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
            mma_C C;
            mma_B B;
            half2 dsB[mma_C::ne/2];

#pragma unroll
            for (int l = 0; l < mma_B::ne; ++l) {
                const int j = j0 + mma_B::get_j(l);
                const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;

                B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
            }
#pragma unroll
            for (int l = 0; l < mma_C::ne/2; ++l) {
                const int j = j0 + mma_C::get_j(l);

                dsB[l] = y_ds[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
            }

            C.mma_K8(A[kvdr/4], B);

#pragma unroll
            for (int l = 0; l < mma_C::ne; ++l) {
                // Integer dot product times scale, and min term, each weighted by the y factors.
                tmpd[l] += (C.x[l]*scA[l/2][kvdr/4]) *  __low2float(dsB[l%2]);
                tmpm[l] += mA[l/2][kvdr/4]           * __high2float(dsB[l%2]);
            }
        }

#pragma unroll
        for (int l = 0; l < mma_C::ne; ++l) {
            sum[(j0/mma_B::J)*mma_C::ne + l] += __low2float(dmA[l/2])*tmpd[l] - __high2float(dmA[l/2])*tmpm[l];
        }
    }
}
|
||||
|
||||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
||||
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
|
||||
@ -1280,7 +1462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, int nwarps>
|
||||
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
|
||||
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
|
||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
|
||||
|
||||
@ -1307,6 +1489,97 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
|
||||
}
|
||||
}
|
||||
|
||||
// Tile dot product q6_K x q8_1 using int8 MMA fragments (mma_int_C_I16J8::mma_K4).
//
// x_ql: quantized values, row stride QR6_K*WARP_SIZE + 1 ints.
// x_dm: reinterpreted as float (x_df), one factor per row/block.
// x_qh: unused in this function.
// x_sc: per-row signed 8 bit scales.
// y:    q8_1 tile; the start of each MMQ_TILE_Y_K row is read as float (y_df),
//       the quantized values start 4 ints in (y_qs).
// sum:  per-thread accumulators, mma_C::ne floats for every block of mma_B::J columns.
// k0:   k offset inside the tile.
//
// threadIdx.y selects a band of mma_A::I rows; nwarps*mma_A::I must equal mmq_y.
// Each kvdr step issues two K4 MMAs (k and k + K) so that each one gets its own scale.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {

    // x_sc is read below, only x_qh is really unused.
    GGML_UNUSED(x_qh);

    typedef mma_int_A_I16K4 mma_A;
    typedef mma_int_B_J8K4  mma_B;
    typedef mma_int_C_I16J8 mma_C;

    const float * x_df = (const float *) x_dm;
    const int   * y_qs = (const int   *) y + 4;
    const float * y_df = (const float *) y;

    const int i0 = threadIdx.y*mma_A::I;
    static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");

    // Operands that only depend on the x tile are loaded once, outside the loop over j0.
    mma_A A[4];
    int   scA[mma_C::ne/2][4];
    float  dA[mma_C::ne/2];
#pragma unroll
    for (int kvdr = 0; kvdr < VDR_Q6_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
        for (int l = 0; l < mma_A::ne; ++l) {
            const int i = i0 + mma_A::get_i(l);
            const int k = QR6_K*k0 + QR6_K*kvdr + mma_A::get_k(l);

            A[kvdr/2 + 0].x[l] = x_ql[i*(QR6_K*WARP_SIZE + 1) + k + 0];
            A[kvdr/2 + 1].x[l] = x_ql[i*(QR6_K*WARP_SIZE + 1) + k + mma_A::K];
        }

#pragma unroll
        for (int l = 0; l < mma_C::ne/2; ++l) {
            const int i = i0 + mma_C::get_i(2*l);

            const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);

            scA[l][kvdr/2 + 0] = sc[kvdr/2 + 0];
            scA[l][kvdr/2 + 1] = sc[kvdr/2 + 1];
        }
    }

#pragma unroll
    for (int l = 0; l < mma_C::ne/2; ++l) {
        const int i = i0 + mma_C::get_i(2*l);

        dA[l] = x_df[i*(WARP_SIZE/QI6_K) + i/QI6_K + k0/QI6_K];
    }

    // Step by the J of the B fragment type actually used here (mma_int_B_J8K4).
#pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += mma_B::J) {
        float tmp[mma_C::ne] = {0.0f};

#pragma unroll
        for (int kvdr = 0; kvdr < VDR_Q6_K_Q8_1_MMQ; kvdr += 4) {
            mma_C C[2];
            mma_B B[2];
            float dB[mma_C::ne/2];

#pragma unroll
            for (int l = 0; l < mma_B::ne; ++l) {
                const int j = j0 + mma_B::get_j(l);
                const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;

                B[0].x[l] = y_qs[j*MMQ_TILE_Y_K + k + 0];
                B[1].x[l] = y_qs[j*MMQ_TILE_Y_K + k + mma_B::K];
            }
#pragma unroll
            for (int l = 0; l < mma_C::ne/2; ++l) {
                const int j = j0 + mma_C::get_j(l);

                dB[l] = y_df[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
            }

            C[0].mma_K4(A[kvdr/2 + 0], B[0]);
            C[1].mma_K4(A[kvdr/2 + 1], B[1]);

#pragma unroll
            for (int l = 0; l < mma_C::ne; ++l) {
                tmp[l] += (C[0].x[l]*scA[l/2][kvdr/2 + 0] + C[1].x[l]*scA[l/2][kvdr/2 + 1])*dB[l%2];
            }
        }

#pragma unroll
        for (int l = 0; l < mma_C::ne; ++l) {
            sum[(j0/mma_B::J)*mma_C::ne + l] += tmp[l]*dA[l/2];
        }
    }
}
|
||||
|
||||
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
|
||||
static __device__ __forceinline__ void mmq_write_back_dp4a(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
|
||||
#pragma unroll
|
||||
@ -1448,24 +1721,39 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
    // MMQ building blocks for GGML_TYPE_Q4_K. vec_dot and write_back are selected as a
    // matching pair: the MMA variants when INT8_MMA_AVAILABLE is defined, the dp4a
    // variants otherwise. vec_dot is defined exactly once per configuration.
    static constexpr int              vdr        = VDR_Q4_K_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
    static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
    static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
};
|
||||
|
||||
// MMQ building blocks for GGML_TYPE_Q5_K. vec_dot and write_back are selected as a
// matching pair: the MMA variants when INT8_MMA_AVAILABLE is defined, the dp4a
// variants otherwise. vec_dot is defined exactly once per configuration.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
    static constexpr int              vdr        = VDR_Q5_K_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
    static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
    static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
};
|
||||
|
||||
// MMQ building blocks for GGML_TYPE_Q6_K. vec_dot and write_back are selected as a
// matching pair: the MMA variants when INT8_MMA_AVAILABLE is defined, the dp4a
// variants otherwise. vec_dot is defined exactly once per configuration.
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
    static constexpr int              vdr        = VDR_Q6_K_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
    static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q6_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
    static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
    static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q6_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
    static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
};
|
||||
|
||||
static int mmq_need_sum(const ggml_type type_x) {
|
||||
|
File diff suppressed because it is too large
Load Diff
175
ggml-vulkan.cpp
175
ggml-vulkan.cpp
@ -1,5 +1,5 @@
|
||||
#include "ggml-vulkan.h"
|
||||
|
||||
#include <vulkan/vulkan_core.h>
|
||||
#ifdef GGML_VULKAN_RUN_TESTS
|
||||
#include <chrono>
|
||||
#endif
|
||||
@ -9,12 +9,13 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
@ -150,7 +151,7 @@ struct vk_device {
|
||||
vk_pipeline pipeline_relu_f32;
|
||||
vk_pipeline pipeline_diag_mask_inf_f32;
|
||||
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
|
||||
vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
|
||||
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
|
||||
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
|
||||
vk_pipeline pipeline_argsort_f32;
|
||||
vk_pipeline pipeline_sum_rows_f32;
|
||||
@ -283,26 +284,15 @@ struct vk_op_diag_mask_push_constants {
|
||||
|
||||
struct vk_op_rope_push_constants {
|
||||
uint32_t ncols;
|
||||
uint32_t n_dims;
|
||||
float freq_scale;
|
||||
uint32_t p_delta_rows;
|
||||
float freq_base;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float corr_dims[4];
|
||||
};
|
||||
|
||||
struct vk_op_rope_neox_push_constants {
|
||||
uint32_t ncols;
|
||||
uint32_t ndims;
|
||||
float freq_scale;
|
||||
uint32_t p_delta_rows;
|
||||
float freq_base;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float corr_dims[4];
|
||||
float corr_dims[2];
|
||||
float theta_scale;
|
||||
float inv_ndims;
|
||||
uint32_t has_freq_facs;
|
||||
uint32_t has_ff;
|
||||
};
|
||||
|
||||
struct vk_op_soft_max_push_constants {
|
||||
@ -1534,11 +1524,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
@ -1566,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
vk::PhysicalDeviceMaintenance3Properties props3;
|
||||
vk::PhysicalDeviceSubgroupProperties subgroup_props;
|
||||
vk::PhysicalDeviceDriverProperties driver_props;
|
||||
props2.pNext = &props3;
|
||||
props3.pNext = &subgroup_props;
|
||||
subgroup_props.pNext = &driver_props;
|
||||
physical_device.getProperties2(&props2);
|
||||
|
||||
const size_t subgroup_size = subgroup_props.subgroupSize;
|
||||
@ -1611,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
fp16 = fp16 && vk12_features.shaderFloat16;
|
||||
|
||||
std::string device_name = props2.properties.deviceName.data();
|
||||
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
|
||||
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
|
||||
|
||||
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
|
||||
std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
|
||||
@ -1707,7 +1699,78 @@ void ggml_vk_instance_init() {
|
||||
vk::PhysicalDeviceProperties props = devices[i].getProperties();
|
||||
|
||||
if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
|
||||
vk_instance.device_indices.push_back(i);
|
||||
// Check if there are two physical devices corresponding to the same GPU
|
||||
auto old_device = std::find_if(
|
||||
vk_instance.device_indices.begin(),
|
||||
vk_instance.device_indices.end(),
|
||||
[&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
|
||||
);
|
||||
if (old_device == vk_instance.device_indices.end()) {
|
||||
vk_instance.device_indices.push_back(i);
|
||||
} else {
|
||||
// There can be two physical devices corresponding to the same GPU if there are 2 different drivers
|
||||
// This can cause errors when splitting layers across the devices, so only one of them is kept
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
|
||||
#endif
|
||||
|
||||
vk::PhysicalDeviceProperties2 old_prop;
|
||||
vk::PhysicalDeviceDriverProperties old_driver;
|
||||
old_prop.pNext = &old_driver;
|
||||
devices[*old_device].getProperties2(&old_prop);
|
||||
|
||||
vk::PhysicalDeviceProperties2 new_prop;
|
||||
vk::PhysicalDeviceDriverProperties new_driver;
|
||||
new_prop.pNext = &new_driver;
|
||||
devices[i].getProperties2(&new_prop);
|
||||
|
||||
std::map<vk::DriverId, int> driver_priorities {};
|
||||
int old_priority = std::numeric_limits<int>::max();
|
||||
int new_priority = std::numeric_limits<int>::max();
|
||||
|
||||
// Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
|
||||
// Smaller number -> higher priority
|
||||
switch (old_prop.properties.vendorID) {
|
||||
case VK_VENDOR_ID_AMD:
|
||||
driver_priorities[vk::DriverId::eMesaRadv] = 1;
|
||||
driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
|
||||
driver_priorities[vk::DriverId::eAmdProprietary] = 3;
|
||||
break;
|
||||
case VK_VENDOR_ID_INTEL:
|
||||
driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
|
||||
driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
|
||||
break;
|
||||
case VK_VENDOR_ID_NVIDIA:
|
||||
driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
|
||||
#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
|
||||
driver_priorities[vk::DriverId::eMesaNvk] = 2;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
if (driver_priorities.count(old_driver.driverID)) {
|
||||
old_priority = driver_priorities[old_driver.driverID];
|
||||
}
|
||||
if (driver_priorities.count(new_driver.driverID)) {
|
||||
new_priority = driver_priorities[new_driver.driverID];
|
||||
}
|
||||
|
||||
if (new_priority < old_priority) {
|
||||
auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
|
||||
vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
|
||||
vk_instance.device_indices.push_back(i);
|
||||
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
|
||||
#endif
|
||||
}
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
else {
|
||||
std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -3905,10 +3968,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_rope_f32;
|
||||
return ctx->device->pipeline_rope_norm_f32;
|
||||
}
|
||||
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_rope_f16;
|
||||
return ctx->device->pipeline_rope_norm_f16;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
@ -4152,24 +4215,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else if (op == GGML_OP_ROPE) {
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
if (is_neox) {
|
||||
// Empty src2 is possible in rope, but the shader needs a buffer
|
||||
vk_subbuffer subbuf_z;
|
||||
if (use_src2) {
|
||||
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
||||
} else {
|
||||
subbuf_z = { d_X, 0, d_X->size };
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
// Empty src2 is possible in rope, but the shader needs a buffer
|
||||
vk_subbuffer subbuf_z;
|
||||
if (use_src2) {
|
||||
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
||||
} else {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
subbuf_z = { d_X, 0, d_X->size };
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else if (use_src2) {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
@ -4391,7 +4446,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||
|
||||
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
// const int mode = ((int32_t *) dst->op_params)[2];
|
||||
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
||||
const float freq_base = ((float *) dst->op_params)[5];
|
||||
@ -4401,28 +4456,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
||||
const float beta_fast = ((float *) dst->op_params)[9];
|
||||
const float beta_slow = ((float *) dst->op_params)[10];
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
#pragma message("TODO: update rope NORM mode to match NEOX mode")
|
||||
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
|
||||
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
if (is_neox) {
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float inv_ndims = -1.0f / n_dims;
|
||||
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
||||
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
|
||||
src2 != nullptr,
|
||||
});
|
||||
} else {
|
||||
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
||||
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
||||
});
|
||||
}
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
||||
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
|
||||
src2 != nullptr,
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
@ -6070,7 +6113,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
|
||||
std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
||||
#endif
|
||||
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
||||
vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
|
||||
|
||||
vk_buffer dev_buffer = nullptr;
|
||||
try {
|
||||
dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
|
||||
} catch (const vk::SystemError& e) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
|
||||
|
||||
@ -6466,7 +6515,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
||||
// return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
||||
// } break;
|
||||
case GGML_OP_ROPE:
|
||||
return true;
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
|
@ -2400,7 +2400,7 @@ void main() {
|
||||
"""
|
||||
|
||||
# ROPE
|
||||
rope_src = """
|
||||
rope_norm_src = """
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
@ -2408,17 +2408,21 @@ rope_src = """
|
||||
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {int data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
layout (binding = 1) readonly buffer Y {int data_pos[];};
|
||||
layout (binding = 2) readonly buffer Z {float data_ff[];};
|
||||
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ncols;
|
||||
uint n_dims;
|
||||
float freq_scale;
|
||||
uint p_delta_rows;
|
||||
float freq_base;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float corr_dims[4];
|
||||
float corr_dims[2];
|
||||
float theta_scale;
|
||||
uint has_ff;
|
||||
} p;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const uint i0) {
|
||||
@ -2450,14 +2454,24 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
if (col >= p.n_dims) {
|
||||
const uint i = row*p.ncols + col;
|
||||
|
||||
data_d[i + 0] = data_a[i + 0];
|
||||
data_d[i + 1] = data_a[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i = row*p.ncols + col;
|
||||
const uint i2 = row/p.p_delta_rows;
|
||||
|
||||
const int pos = data_b[i2];
|
||||
const float theta_base = pos * pow(p.freq_base, -float(col)/p.ncols);
|
||||
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
|
||||
|
||||
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, col, cos_theta, sin_theta);
|
||||
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
|
||||
|
||||
const float x0 = float(data_a[i + 0]);
|
||||
const float x1 = float(data_a[i + 1]);
|
||||
@ -2475,22 +2489,21 @@ rope_neox_src = """
|
||||
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {int data_b[];};
|
||||
layout (binding = 2) readonly buffer Z {float data_freq_factors[];};
|
||||
layout (binding = 1) readonly buffer Y {int data_pos[];};
|
||||
layout (binding = 2) readonly buffer Z {float data_ff[];};
|
||||
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ncols;
|
||||
uint ndims;
|
||||
uint n_dims;
|
||||
float freq_scale;
|
||||
uint p_delta_rows;
|
||||
float freq_base;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float corr_dims[4];
|
||||
float corr_dims[2];
|
||||
float theta_scale;
|
||||
float inv_ndims;
|
||||
uint has_freq_facs;
|
||||
uint has_ff;
|
||||
} p;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const uint i0) {
|
||||
@ -2522,11 +2535,8 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint ib = col / p.ndims;
|
||||
const uint ic = col % p.ndims;
|
||||
|
||||
if (ib > 0) {
|
||||
const uint i = row*p.ncols + ib*p.ndims + ic;
|
||||
if (col >= p.n_dims) {
|
||||
const uint i = row*p.ncols + col;
|
||||
|
||||
data_d[i + 0] = data_a[i + 0];
|
||||
data_d[i + 1] = data_a[i + 1];
|
||||
@ -2534,29 +2544,27 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i = row*p.ncols + ib*p.ndims + ic/2;
|
||||
const uint i = row*p.ncols + col/2;
|
||||
const uint i2 = row/p.p_delta_rows;
|
||||
|
||||
const int pos = data_b[i2];
|
||||
const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
|
||||
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;
|
||||
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
|
||||
|
||||
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, ic, cos_theta, sin_theta);
|
||||
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
|
||||
|
||||
const float x0 = float(data_a[i + 0]);
|
||||
const float x1 = float(data_a[i + p.ndims/2]);
|
||||
const float x1 = float(data_a[i + p.n_dims/2]);
|
||||
|
||||
data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
|
||||
data_d[i + p.ndims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
|
||||
data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
"""
|
||||
|
||||
argsort_src = """
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#define BLOCK_SIZE 1024
|
||||
#define ASC 0
|
||||
|
||||
@ -3039,8 +3047,8 @@ async def main():
|
||||
tasks.append(string_to_spv("soft_max_f32", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "C_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("soft_max_f32_f16", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float16_t", "C_TYPE": "float16_t", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("rope_norm_f32", rope_norm_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_norm_f16", rope_norm_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
@ -870,7 +870,7 @@ int main() {
|
||||
}
|
||||
});
|
||||
|
||||
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
|
||||
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
|
||||
test_all("Python", [](const TestCase & tc) {
|
||||
write("test-json-schema-input.tmp", tc.schema);
|
||||
tc.verify_status(std::system(
|
||||
@ -878,7 +878,7 @@ int main() {
|
||||
tc.verify(read("test-grammar-output.tmp"));
|
||||
});
|
||||
} else {
|
||||
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
|
||||
fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
|
||||
}
|
||||
|
||||
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
|
||||
|
Loading…
Reference in New Issue
Block a user