mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-28 04:47:04 +01:00
CUDA: refactor and optimize IQ MMVQ (#8215)
* CUDA: refactor and optimize IQ MMVQ * uint -> uint32_t * __dp4a -> ggml_cuda_dp4a * remove MIN_CC_DP4A checks * change default * try CI fix
This commit is contained in:
parent
dae57a1ebc
commit
cb5fad4c6c
@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
|
|||||||
#define QR6_K 2
|
#define QR6_K 2
|
||||||
|
|
||||||
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
||||||
#define QR2_XXS 8
|
#define QR2_XXS 4
|
||||||
|
|
||||||
#define QI2_XS (QK_K / (4*QR2_XS))
|
#define QI2_XS (QK_K / (4*QR2_XS))
|
||||||
#define QR2_XS 8
|
#define QR2_XS 4
|
||||||
|
|
||||||
#define QI2_S (QK_K / (4*QR2_S))
|
#define QI2_S (QK_K / (4*QR2_S))
|
||||||
#define QR2_S 8
|
#define QR2_S 4
|
||||||
|
|
||||||
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
||||||
#define QR3_XXS 8
|
#define QR3_XXS 4
|
||||||
|
|
||||||
#define QI3_XS (QK_K / (4*QR3_XS))
|
#define QI3_XS (QK_K / (4*QR3_XS))
|
||||||
#define QR3_XS 8
|
#define QR3_XS 4
|
||||||
|
|
||||||
#define QI1_S (QK_K / (4*QR1_S))
|
#define QI1_S (QK_K / (4*QR1_S))
|
||||||
#define QR1_S 8
|
#define QR1_S 8
|
||||||
@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
|
|||||||
#define QR4_NL 2
|
#define QR4_NL 2
|
||||||
|
|
||||||
#define QI4_XS (QK_K / (4*QR4_XS))
|
#define QI4_XS (QK_K / (4*QR4_XS))
|
||||||
#define QR4_XS 8
|
#define QR4_XS 2
|
||||||
|
|
||||||
#define QI3_S (QK_K / (4*QR3_S))
|
#define QI3_S (QK_K / (4*QR3_S))
|
||||||
#define QR3_S 8
|
#define QR3_S 4
|
||||||
|
|
||||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||||
|
|
||||||
|
@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|||||||
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||||
|
|
||||||
|
// if mmvq is available it's a better choice than dmmv:
|
||||||
|
#ifndef GGML_CUDA_FORCE_DMMV
|
||||||
|
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||||
|
#endif // GGML_CUDA_FORCE_DMMV
|
||||||
|
|
||||||
bool any_gpus_with_slow_fp16 = false;
|
bool any_gpus_with_slow_fp16 = false;
|
||||||
|
|
||||||
if (split) {
|
if (split) {
|
||||||
@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|||||||
}
|
}
|
||||||
|
|
||||||
const int cc = ggml_cuda_info().devices[id].cc;
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
|
|
||||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||||
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
|
|
||||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||||
}
|
}
|
||||||
|
|
||||||
// if mmvq is available it's a better choice than dmmv:
|
|
||||||
#ifndef GGML_CUDA_FORCE_DMMV
|
|
||||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
|
||||||
#endif // GGML_CUDA_FORCE_DMMV
|
|
||||||
|
|
||||||
// debug helpers
|
// debug helpers
|
||||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||||
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
||||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
#elif defined(RDNA3)
|
unsigned int c;
|
||||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
#elif defined(__gfx1010__) || defined(__gfx900__)
|
#pragma unroll
|
||||||
int tmp1;
|
for (int i = 0; i < 4; ++i) {
|
||||||
int tmp2;
|
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||||
asm("\n \
|
}
|
||||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
|
||||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
|
||||||
v_add3_u32 %0, %1, %2, %0 \n \
|
|
||||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
|
||||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
|
||||||
v_add3_u32 %0, %1, %2, %0 \n \
|
|
||||||
"
|
|
||||||
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
|
||||||
: "v"(a), "v"(b)
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
|
||||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
|
||||||
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
|
||||||
#endif
|
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
|||||||
}
|
}
|
||||||
#endif // CUDART_VERSION < 12000
|
#endif // CUDART_VERSION < 12000
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||||
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||||
|
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||||
|
#elif defined(RDNA3)
|
||||||
|
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||||
|
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||||
|
int tmp1;
|
||||||
|
int tmp2;
|
||||||
|
asm("\n \
|
||||||
|
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
||||||
|
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
||||||
|
v_add3_u32 %0, %1, %2, %0 \n \
|
||||||
|
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
||||||
|
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
||||||
|
v_add3_u32 %0, %1, %2, %0 \n \
|
||||||
|
"
|
||||||
|
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
||||||
|
: "v"(a), "v"(b)
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||||
|
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||||
|
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
||||||
|
#endif
|
||||||
|
return c;
|
||||||
|
|
||||||
|
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||||
|
return __dp4a(a, b, c);
|
||||||
|
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||||
|
const int8_t * a8 = (const int8_t *) &a;
|
||||||
|
const int8_t * b8 = (const int8_t *) &b;
|
||||||
|
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||||
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||||
|
|
||||||
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: move to ggml-common.h
|
// TODO: move to ggml-common.h
|
||||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||||
|
|
||||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
||||||
|
|
||||||
|
@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
|
|||||||
template<typename T, int D>
|
template<typename T, int D>
|
||||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
||||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
|
|
||||||
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
|
||||||
GGML_UNUSED(Q_v);
|
GGML_UNUSED(Q_v);
|
||||||
|
|
||||||
half sum = 0.0f;
|
T sum = 0.0f;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
|
||||||
@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
|||||||
const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||||
|
|
||||||
const int sumi = __dp4a(v, u, 0);
|
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||||
|
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
if (std::is_same<T, half>::value) {
|
if (std::is_same<T, half>::value) {
|
||||||
@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
|
|||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
#else
|
|
||||||
GGML_UNUSED(K_c);
|
|
||||||
GGML_UNUSED(Q_v);
|
|
||||||
GGML_UNUSED(Q_q8);
|
|
||||||
GGML_UNUSED(Q_ds_v);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, int D>
|
template<typename T, int D>
|
||||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
||||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
|
|
||||||
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
|
||||||
GGML_UNUSED(Q_v);
|
GGML_UNUSED(Q_v);
|
||||||
@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
|||||||
const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
|
||||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||||
|
|
||||||
const int sumi = __dp4a(v, u, 0);
|
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||||
|
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
if (std::is_same<T, half>::value) {
|
if (std::is_same<T, half>::value) {
|
||||||
@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
|
|||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
#else
|
|
||||||
GGML_UNUSED(K_c);
|
|
||||||
GGML_UNUSED(Q_v);
|
|
||||||
GGML_UNUSED(Q_q8);
|
|
||||||
GGML_UNUSED(Q_ds_v);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, int D>
|
template<typename T, int D>
|
||||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
||||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
|
|
||||||
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
|
||||||
GGML_UNUSED(Q_v);
|
GGML_UNUSED(Q_v);
|
||||||
@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
|||||||
|
|
||||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||||
|
|
||||||
const int sumi = __dp4a(v, u, 0);
|
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||||
|
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
if (std::is_same<T, half>::value) {
|
if (std::is_same<T, half>::value) {
|
||||||
@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
|
|||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
#else
|
|
||||||
GGML_UNUSED(K_c);
|
|
||||||
GGML_UNUSED(Q_v);
|
|
||||||
GGML_UNUSED(Q_q8);
|
|
||||||
GGML_UNUSED(Q_ds_v);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, int D>
|
template<typename T, int D>
|
||||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
||||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
|
|
||||||
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
|
||||||
GGML_UNUSED(Q_v);
|
GGML_UNUSED(Q_v);
|
||||||
@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
|||||||
|
|
||||||
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
const int u = Q_q8[k_KQ_0/WARP_SIZE];
|
||||||
|
|
||||||
const int sumi = __dp4a(v, u, 0);
|
const int sumi = ggml_cuda_dp4a(v, u, 0);
|
||||||
|
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
if (std::is_same<T, half>::value) {
|
if (std::is_same<T, half>::value) {
|
||||||
@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
|
|||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
#else
|
|
||||||
GGML_UNUSED(K_c);
|
|
||||||
GGML_UNUSED(Q_v);
|
|
||||||
GGML_UNUSED(Q_q8);
|
|
||||||
GGML_UNUSED(Q_ds_v);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, int D>
|
template <typename T, int D>
|
||||||
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
||||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
|
|
||||||
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
|
||||||
GGML_UNUSED(Q_v);
|
GGML_UNUSED(Q_v);
|
||||||
@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
|
|||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
#else
|
|
||||||
GGML_UNUSED(K_c);
|
|
||||||
GGML_UNUSED(Q_v);
|
|
||||||
GGML_UNUSED(Q_q8);
|
|
||||||
GGML_UNUSED(Q_ds_v);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, int D>
|
template <typename T, int D>
|
||||||
|
@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
|||||||
|
|
||||||
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||||
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
|
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
||||||
type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
|
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
|
||||||
|
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
|
||||||
|
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
|
||||||
|
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
|
||||||
|
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
|
||||||
|
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
|
||||||
|
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
|
||||||
1;
|
1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -735,7 +735,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(32)]] {
|
||||||
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
|
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -760,7 +760,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
|||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(32)]] {
|
||||||
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
|
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -785,7 +785,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
|||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(32)]] {
|
||||||
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
|
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -810,7 +810,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(32)]] {
|
||||||
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
|
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -834,7 +834,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
|||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(32)]] {
|
||||||
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -924,7 +924,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
|||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(32)]] {
|
||||||
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
|
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -820,7 +820,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
||||||
|
|
||||||
#if QR2_XXS == 8
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
const uint16_t * q2 = bq2->qs + 4*ib32;
|
const uint16_t * q2 = bq2->qs + 4*ib32;
|
||||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||||
@ -838,26 +837,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|||||||
}
|
}
|
||||||
const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
|
const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
|
||||||
return d * sumi;
|
return d * sumi;
|
||||||
#else
|
|
||||||
// iqs is 0...15
|
|
||||||
const int ib32 = iqs/2;
|
|
||||||
const int il = iqs%2;
|
|
||||||
const uint16_t * q2 = bq2->qs + 4*ib32;
|
|
||||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
|
||||||
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
|
||||||
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
|
||||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
|
||||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f;
|
|
||||||
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
|
||||||
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
|
||||||
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
|
||||||
int sumi1 = 0, sumi2 = 0;
|
|
||||||
for (int j = 0; j < 8; ++j) {
|
|
||||||
sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
|
|
||||||
sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
|
|
||||||
}
|
|
||||||
return d * (sumi1 + sumi2);
|
|
||||||
#endif
|
|
||||||
#else
|
#else
|
||||||
assert(false);
|
assert(false);
|
||||||
return 0.f;
|
return 0.f;
|
||||||
|
Loading…
Reference in New Issue
Block a user