llama.cpp/ggml-cuda.cu

#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"

#if defined(GGML_USE_HIPBLAS)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#endif
#include "ggml-common.h"

#include <algorithm>
#include <assert.h>
#include <atomic>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <float.h>
#include <limits>
#include <stdint.h>
#include <stdio.h>
#include <string>
#include <vector>
#include <map>
#include <array>

// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F  HIPBLAS_R_16F
#define CUDA_R_32F  HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
#else
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#endif
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap abort
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#else
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020

#endif // defined(GGML_USE_HIPBLAS)

#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

#define CC_PASCAL     600
#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA      700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
#define CC_RDNA3      (CC_OFFSET_AMD + 1100)

#define GGML_CUDA_MAX_NODES 8192

// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// -  7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ

// TODO: improve this to be correct for more hardware
//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif

#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
#define  MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available

#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#ifndef __has_builtin
    #define __has_builtin(x) 0
#endif

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
#else
    int8x4_t c;
    int16_t tmp;
#pragma unroll
    for (int i = 0; i < 4; i++) {
        tmp = va[i] - vb[i];
        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        c[i] = tmp;
    }
    return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}

static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    return __vsubss4(a, b);
}

static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
}

static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;
}
#endif // defined(GGML_USE_HIPBLAS)

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

[[noreturn]]
static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
    int id = -1; // in case cudaGetDevice fails
    cudaGetDevice(&id);

    fprintf(stderr, "CUDA error: %s\n", msg);
    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
    fprintf(stderr, "  %s\n", stmt);
    // abort with GGML_ASSERT to get a stack trace
    GGML_ASSERT(!"CUDA error");
}

#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
     do {                                                                           \
        auto err_ = (err);                                                          \
        if (err_ != (success)) {                                                    \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
        }                                                                           \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)

#if CUDART_VERSION >= 12000
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        return cublasGetStatusString(err);
    }
#else
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        switch (err) {
            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
            default: return "unknown error";
        }
    }
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIPBLAS)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100

#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16

static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] <<  0;
    x32 |= x16[1] << 16;

    return x32;
}

static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] <<  0;
    x32 |= x16[1] << 16;

    return x32;
}

static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}

static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}

template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;

typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
typedef void (*ggml_cuda_op_mul_mat_t)(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);
typedef void (*ggml_cuda_op_flatten_t)(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);

typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
typedef void (*load_tiles_cuda_t)(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
typedef float (*vec_dot_q_mul_mat_cuda_t)(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);

#define WARP_SIZE 32
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256
#define CUDA_CLAMP_BLOCK_SIZE 256
#define CUDA_ROPE_BLOCK_SIZE 256
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
#define CUDA_ALIBI_BLOCK_SIZE 32
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_UPSCALE_BLOCK_SIZE 256
#define CUDA_CONCAT_BLOCK_SIZE 256
#define CUDA_PAD_BLOCK_SIZE 256
#define CUDA_ARANGE_BLOCK_SIZE 256
#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
#define CUDA_ACC_BLOCK_SIZE 256
#define CUDA_IM2COL_BLOCK_SIZE 256
#define CUDA_POOL2D_BLOCK_SIZE 256

#define CUDA_Q8_0_NE_ALIGN 2048

// dmmv = dequantize_mul_mat_vec
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif
#ifndef GGML_CUDA_MMV_Y
#define GGML_CUDA_MMV_Y 1
#endif

#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 2
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif

#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE

#define MUL_MAT_SRC1_COL_STRIDE 128

#define MAX_STREAMS 8
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
};

// this is faster on Windows
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
static void ggml_cuda_set_device(const int device) {
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));

    if (device == current_device) {
        return;
    }

    CUDA_CHECK(cudaSetDevice(device));
}

static int g_device_count = -1;
static int g_main_device = 0;
static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};

struct cuda_device_capabilities {
    int     cc;                 // compute capability
    size_t  smpb;               // max. shared memory per block
    bool    vmm;                // virtual memory support
    size_t  vmm_granularity;    // granularity of virtual memory
};

static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };

static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    (void) arch_list;
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

    (void) no_device_code; // suppress unused function warning
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
}

#ifdef GGML_CUDA_F16
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
   for (int mask = 16; mask > 0; mask >>= 1) {
       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
   }
   return a;
#else
   (void) a;
   NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#endif // GGML_CUDA_F16

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}

//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//#pragma unroll
//    for (int mask = 16; mask > 0; mask >>= 1) {
//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
//    }
//    return x;
//#else
//    (void) x;
//    NO_DEVICE_CODE;
//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//}

static __device__ __forceinline__ float op_repeat(const float a, const float b) {
    return b;
    GGML_UNUSED(a);
}

static __device__ __forceinline__ float op_add(const float a, const float b) {
    return a + b;
}

static __device__ __forceinline__ float op_mul(const float a, const float b) {
    return a * b;
}

static __device__ __forceinline__ float op_div(const float a, const float b) {
    return a / b;
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
        /*int s10,*/ int s11, int s12, int s13) {
    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;

    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  = i_src0;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
        const int i10 = i0 % ne10;
        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
    }
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
        /*int s10,*/ int s11, int s12, int s13) {

    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    const int i3 = i/(ne2*ne1*ne0);
    const int i2 = (i/(ne1*ne0)) % ne2;
    const int i1 = (i/ne0) % ne1;
    const int i0 = i % ne0;

    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  = i_src0;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    const int i10 = i0 % ne10;
    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}

static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, int offset) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    int src1_idx = i - offset;
    int oz = src1_idx / nb2;
    int oy = (src1_idx - (oz * nb2)) / nb1;
    int ox = src1_idx % nb1;
    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
    } else {
        dst[i] = x[i];
    }
}

static __global__ void gelu_f32(const float * x, float * dst, const int k) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    float xi = x[i];
    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
}

static __global__ void silu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = x[i] / (1.0f + expf(-x[i]));
}

static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
    const float GELU_QUICK_COEF = -1.702f;
    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
}

static __global__ void tanh_f32(const float * x, float * dst, int k) {
    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = tanhf(x[i]);
}

static __global__ void relu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0);
}

static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
}

static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
}

static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
}

static __global__ void sqr_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = x[i] * x[i];
}

template <int block_size>
static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    float2 mean_var = make_float2(0.f, 0.f);

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row*ncols + col];
        mean_var.x += xi;
        mean_var.y += xi * xi;
    }

    // sum up partial sums
    mean_var = warp_reduce_sum(mean_var);
    if (block_size > WARP_SIZE) {
        __shared__ float2 s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = mean_var;
        }
        __syncthreads();
        mean_var = s_sum[lane_id];
        mean_var = warp_reduce_sum(mean_var);
    }

    const float mean = mean_var.x / ncols;
    const float var = mean_var.y / ncols - mean * mean;
    const float inv_std = rsqrtf(var + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
    }
}

static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    // operation
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    if (blockIdx.z < ne02) { // src0
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
            blockIdx.z * ne0 * gridDim.y;
        dst[offset_dst] = x[offset_src];
    } else {
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
            (blockIdx.z - ne02) * ne0 *  gridDim.y;
        dst[offset_dst] = y[offset_src];
    }
}

static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
    // blockIdx.z: idx of ne02*ne03
    // blockIdx.y: idx of ne01*scale_factor， aka ne1
    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
    // ne00xne01: ne00 * ne01
    int ne0 = ne00 * scale_factor;
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    // operation
    int i00 = nidx / scale_factor;
    int i01 = blockIdx.y / scale_factor;
    int offset_src =
        i00 +
        i01 * ne00 +
        blockIdx.z * ne00xne01;
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    dst[offset_dst] = x[offset_src];
}

static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
    // blockIdx.y: idx of ne1
    // blockIDx.x: idx of ne0 / BLOCK_SIZE
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    // operation
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
        int offset_src =
            nidx +
            blockIdx.y * ne00 +
            blockIdx.z * ne00 * ne01;
        dst[offset_dst] = x[offset_src];
    } else {
        dst[offset_dst] = 0.0f;
    }
}

static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
    // blockIDx.x: idx of ne0 / BLOCK_SIZE
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    dst[nidx] = start + step * nidx;
}

static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
    // blockIDx.y: idx of timesteps->ne[0]
    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
    int i = blockIdx.y;
    int j = threadIdx.x + blockIdx.x * blockDim.x;
    float * embed_data = (float *)((char *)dst +  i*nb1);

    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
        embed_data[dim] = 0.f;
    }

    int half = dim / 2;
    if (j >= half) {
        return;
    }

    float timestep = timesteps[i];
    float freq = (float)expf(-logf(max_period) * j / half);
    float arg = timestep * freq;
    embed_data[j] = cosf(arg);
    embed_data[j + half] = sinf(arg);
}

template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
    // blockIdx.x: num_groups idx
    // threadIdx.x: block_size idx
    int start = blockIdx.x * group_size;
    int end = start + group_size;

    start += threadIdx.x;

    if (end >= ne_elements) {
        end = ne_elements;
    }

    float tmp = 0.0f; // partial sum for thread in warp

    for (int j = start; j < end; j += block_size) {
        tmp += x[j];
    }

    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    float mean = tmp / group_size;
    tmp = 0.0f;

    for (int j = start; j < end; j += block_size) {
        float xi = x[j] - mean;
        dst[j] = xi;
        tmp += xi * xi;
    }

    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    float variance = tmp / group_size;
    float scale = rsqrtf(variance + eps);
    for (int j = start; j < end; j += block_size) {
        dst[j] *= scale;
    }
}

template <int block_size>
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    float tmp = 0.0f; // partial sum for thread in warp

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = x[row*ncols + col];
        tmp += xi * xi;
    }

    // sum up partial sums
    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __shared__ float s_sum[32];
        int warp_id = threadIdx.x / WARP_SIZE;
        int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = tmp;
        }
        __syncthreads();
        tmp = s_sum[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    const float mean = tmp / ncols;
    const float scale = rsqrtf(mean + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = scale * x[row*ncols + col];
    }
}

static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const dfloat d = x[ib].d;

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {8.0f, 8.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 8.0f) * d;
    v.y = (v.y - 8.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    const int vui = x[ib].qs[iqs];

    v.x = vui & 0xF;
    v.y = vui >> 4;

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const dfloat d = x[ib].d;

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hsub2(v, {16.0f, 16.0f});
    v = __hmul2(v, {d, d});
#else
    v.x = (v.x - 16.0f) * d;
    v.y = (v.y - 16.0f) * d;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
    const dfloat m = __high2half(x[ib].dm);

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));

    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
    v = __hadd2(v, {m, m});
#else
    v.x = (v.x * d) + m;
    v.y = (v.y * d) + m;
#endif // GGML_CUDA_F16
}

static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const dfloat d = x[ib].d;

    v.x = x[ib].qs[iqs + 0];
    v.y = x[ib].qs[iqs + 1];

#ifdef GGML_CUDA_F16
    v = __hmul2(v, {d, d});
#else
    v.x *= d;
    v.y *= d;
#endif // GGML_CUDA_F16
}

template<typename dst_t>
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

    const int i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }

    dst_t * y = yy + 256*i + 32*ir + 4*il;

    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
    const float d = __half2float(x->d);
    const float dm = -8*d;

    const uint8_t * q = x->qs + 4*il;

    for (int l = 0; l < 4; ++l) {
        y[l+ 0] = d * (q[l] & 0xF) + dm;
        y[l+16] = d * (q[l] >>  4) + dm;
    }
}

template<typename dst_t>
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

    const int i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }

    dst_t * y = yy + 256*i + 32*ir + 4*il;

    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
    const float2 d = __half22float2(x->dm);

    const uint8_t * q = x->qs + 4*il;

    for (int l = 0; l < 4; ++l) {
        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
        y[l+16] = d.x * (q[l] >>  4) + d.y;
    }
}

//================================== k-quants

template<typename dst_t>
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_q2_K * x = (const block_q2_K *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int n   = tid/32;
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;

    const uint8_t q = x[i].qs[32*n + l];
    dst_t * y = yy + i*QK_K + 128*n;

    float dall = __low2half(x[i].dm);
    float dmin = __high2half(x[i].dm);
    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else
    const int is = tid/16;  // 0 or 1
    const int il = tid%16;  // 0...15
    const uint8_t q = x[i].qs[il] >> (2*is);
    dst_t * y = yy + i*QK_K + 16*is + il;
    float dall = __low2half(x[i].dm);
    float dmin = __high2half(x[i].dm);
    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i = blockIdx.x;
    const block_q3_K * x = (const block_q3_K *) vx;

#if QK_K == 256
    const int r = threadIdx.x/4;
    const int tid = r/2;
    const int is0 = r%2;
    const int l0 = 16*is0 + 4*(threadIdx.x%4);
    const int n = tid / 4;
    const int j = tid - 4*n;

    uint8_t m = 1 << (4*n + j);
    int is = 8*n + 2*j + is0;
    int shift = 2*j;

    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
    float d_all = x[i].d;
    float dl = d_all * (us - 32);

    dst_t * y = yy + i*QK_K + 128*n + 32*j;
    const uint8_t * q = x[i].qs + 32*n;
    const uint8_t * hm = x[i].hmask;

    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else
    const int tid = threadIdx.x;
    const int is  = tid/16;  // 0 or 1
    const int il  = tid%16;  // 0...15
    const int im  = il/8;    // 0...1
    const int in  = il%8;    // 0...7

    dst_t * y = yy + i*QK_K + 16*is + il;

    const uint8_t q = x[i].qs[il] >> (2*is);
    const uint8_t h = x[i].hmask[in] >> (2*is + im);
    const float   d = (float)x[i].d;

    if (is == 0) {
        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    } else {
        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    }
#endif

}

#if QK_K == 256
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        d = q[j] & 63; m = q[j + 4] & 63;
    } else {
        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}
#endif

template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q4_K * x = (const block_q4_K *) vx;

    const int i = blockIdx.x;

#if QK_K == 256
    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int is  = 2*il;
    const int n   = 4;

    dst_t * y = yy + i*QK_K + 64*il + n*ir;

    const float dall = __low2half(x[i].dm);
    const float dmin = __high2half(x[i].dm);

    const uint8_t * q = x[i].qs + 32*il + n*ir;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[i].scales, sc, m);
    const float d1 = dall * sc; const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[i].scales, sc, m);
    const float d2 = dall * sc; const float m2 = dmin * m;
    for (int l = 0; l < n; ++l) {
        y[l + 0] = d1 * (q[l] & 0xF) - m1;
        y[l +32] = d2 * (q[l] >>  4) - m2;
    }
#else
    const int tid = threadIdx.x;
    const uint8_t * q = x[i].qs;
    dst_t * y = yy + i*QK_K;
    const float d = (float)x[i].dm[0];
    const float m = (float)x[i].dm[1];
    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
#endif
}

template<typename dst_t>
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q5_K * x = (const block_q5_K *) vx;

    const int i = blockIdx.x;

#if QK_K == 256
    // assume 64 threads - this is very slightly better than the one below
    const int tid = threadIdx.x;
    const int il  = tid/16;   // il is in 0...3
    const int ir  = tid%16;   // ir is in 0...15
    const int is  = 2*il;     // is is in 0...6

    dst_t * y = yy + i*QK_K + 64*il + 2*ir;

    const float dall = __low2half(x[i].dm);
    const float dmin = __high2half(x[i].dm);

    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
    const uint8_t * qh = x[i].qh + 2*ir;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[i].scales, sc, m);
    const float d1 = dall * sc; const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[i].scales, sc, m);
    const float d2 = dall * sc; const float m2 = dmin * m;

    uint8_t   hm  = 1 << (2*il);
    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
    hm <<= 1;
    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else
    const int tid = threadIdx.x;
    const uint8_t q = x[i].qs[tid];
    const int im = tid/8;  // 0...3
    const int in = tid%8;  // 0...7
    const int is = tid/16; // 0 or 1
    const uint8_t h = x[i].qh[in] >> im;
    const float d = x[i].d;
    dst_t * y = yy + i*QK_K + tid;
    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
#endif
}

template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

    const int i = blockIdx.x;
#if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
    const int tid = threadIdx.x;
    const int ip  = tid/32;   // ip is 0 or 1
    const int il  = tid - 32*ip; // 0...32
    const int is  = 8*ip + il/16;

    dst_t * y = yy + i*QK_K + 128*ip + il;

    const float d = x[i].d;

    const uint8_t * ql = x[i].ql + 64*ip + il;
    const uint8_t   qh = x[i].qh[32*ip + il];
    const int8_t  * sc = x[i].scales + is;

    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
#else

    // assume 32 threads
    const int tid = threadIdx.x;
    const int ip  = tid/16;         // 0 or 1
    const int il  = tid - 16*ip;    // 0...15

    dst_t * y = yy + i*QK_K + 16*ip + il;

    const float d = x[i].d;

    const uint8_t   ql = x[i].ql[16*ip + il];
    const uint8_t   qh = x[i].qh[il] >> (2*ip);
    const int8_t  * sc = x[i].scales;

    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
#endif
}

inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return true;
        default:
            return false;
    }
}

template<typename dst_t>
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint16_t * q2 = x[i].qs + 4*ib;
    const uint8_t  * aux8 = (const uint8_t *)q2;
    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint16_t * q2 = x[i].qs + 4*ib;
    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq2_s * x = (const block_iq2_s *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t  * q3 = x[i].qs + 8*ib;
    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
    const uint32_t aux32 = gas[0] | (gas[1] << 16);
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 4; ++j) {
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq3_s * x = (const block_iq3_s *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * qs = x[i].qs + 8*ib;
    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
    const uint8_t signs = x[i].signs[4*ib + il];
    for (int j = 0; j < 4; ++j) {
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
#else
    assert(false);
#endif

}

template<typename dst_t>
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq1_s * x = (const block_iq1_s  *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
#else
    assert(false);
#endif

}

static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

template<typename dst_t>
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i   = blockIdx.x;
    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

    const int tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t  * q4 = x[ib].qs + 4*il;
    const float d = (float)x[ib].d;
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }

}

#if QK_K != 64
template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int i   = blockIdx.x;
    const block_iq4_xs * x = (const block_iq4_xs *)vx;

    const int tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }
}
#endif

static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q2_K * x = (const block_q2_K *)vx + ib0;

    float tmp = 0; // partial sum for thread in warp

#if QK_K == 256
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

    const int step = 16/K_QUANTS_PER_ITERATION;

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
    const int q_offset = 32*im + l0;
    const int s_offset = 8*im;
    const int y_offset = 128*im + l0;

    uint32_t aux[4];
    const uint8_t * d = (const uint8_t *)aux;
    const uint8_t * m = (const uint8_t *)(aux + 2);

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
        aux[0] = a[0] & 0x0f0f0f0f;
        aux[1] = a[1] & 0x0f0f0f0f;
        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;

        float sum1 = 0, sum2 = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];

        }
        tmp += dall * sum1 - dmin * sum2;

    }
#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
    const int offset = tid * K_QUANTS_PER_ITERATION;

    uint32_t uaux[2];
    const uint8_t * d = (const uint8_t *)uaux;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

        const float   * y = yy + i * QK_K + offset;
        const uint8_t * q = x[i].qs + offset;
        const uint32_t * s = (const uint32_t *)x[i].scales;

        uaux[0] = s[0] & 0x0f0f0f0f;
        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;

        const float2 dall = __half22float2(x[i].dm);

        float sum1 = 0, sum2 = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            const uint8_t ql = q[l];
            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
                  + y[l+16] * d[1] * ((ql >> 2) & 3)
                  + y[l+32] * d[2] * ((ql >> 4) & 3)
                  + y[l+48] * d[3] * ((ql >> 6) & 3);
            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
        }
        tmp += dall.x * sum1 - dall.y * sum2;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q3_K * x = (const block_q3_K *)vx + ib0;

    float tmp = 0; // partial sum for thread in warp

#if QK_K == 256

    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
    const int step = 16/K_QUANTS_PER_ITERATION;
    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0....15 or 0...7

    const uint8_t m = 1 << (4*im);

    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
    const int q_offset =  32*im + l0;
    const int y_offset = 128*im + l0;

    uint16_t utmp[4];
    const int8_t * s = (const int8_t *)utmp;

    const uint16_t s_shift = 4*im;

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;
        const uint8_t * h = x[i].hmask + l0;

        const uint16_t * a = (const uint16_t *)x[i].scales;
        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);

        const float d = x[i].d;

        float sum = 0;
        for (int l = 0; l < n; ++l) {
            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
        }
        tmp += d * sum;

    }
#else

    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
    const int in = offset/8;                                 // 0 or 1
    const int im = offset%8;                                 // 0...7

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

        const float   * y = yy + i * QK_K + offset;
        const uint8_t * q = x[i].qs + offset;
        const uint8_t * s = x[i].scales;

        const float dall = (float)x[i].d;

        float sum = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            const uint8_t hl = x[i].hmask[im+l] >> in;
            const uint8_t ql = q[l];
            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
        }
        tmp += sum;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q4_K * x = (const block_q4_K *)vx + ib0;

#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4

    const int il  = tid/step;                            // 0...3
    const int ir  = tid - step*il;                       // 0...7 or 0...3
    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

#if K_QUANTS_PER_ITERATION == 2
    uint32_t q32[4];
    const uint8_t * q4 = (const uint8_t *)q32;
#else
    uint16_t q16[4];
    const uint8_t * q4 = (const uint8_t *)q16;
#endif

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y1 = yy + i*QK_K + y_offset;
        const float   * y2 = y1 + 128;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

#if K_QUANTS_PER_ITERATION == 2
        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
        const uint32_t * q2 = q1 + 16;

        q32[0] = q1[0] & 0x0f0f0f0f;
        q32[1] = q1[0] & 0xf0f0f0f0;
        q32[2] = q2[0] & 0x0f0f0f0f;
        q32[3] = q2[0] & 0xf0f0f0f0;

        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 4; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#else
        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
        const uint16_t * q2 = q1 + 32;

        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[0] & 0xf0f0;
        q16[2] = q2[0] & 0x0f0f;
        q16[3] = q2[0] & 0xf0f0;

        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 2; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#endif

    }
#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);

    const int step = tid * K_QUANTS_PER_ITERATION;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;

    float tmp = 0;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
        const uint8_t * q = x[i].qs + step;
        const float   * y = yy + i*QK_K + step;
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux16[0] = a[0] & 0x0f0f;
        aux16[1] = (a[0] >> 4) & 0x0f0f;
        const float d = (float)x[i].dm[0];
        const float m = (float)x[i].dm[1];
        float sum = 0.f;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
        }
        tmp += sum;
    }

#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

    const int row = blockIdx.x;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q5_K * x = (const block_q5_K *)vx + ib0;

    float tmp = 0; // partial sum for thread in warp

#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = threadIdx.x/2;  // 0...15
    const int ix  = threadIdx.x%2;

    const int il  = tid/4;     // 0...3
    const int ir  = tid - 4*il;// 0...3
    const int n   = 2;

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    const uint8_t hm1  = 1 << (2*im);
    const uint8_t hm2  = hm1 << 4;

    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

    uint16_t q16[8];
    const uint8_t * q4 = (const uint8_t *)q16;

    for (int i = ix; i < num_blocks_per_row; i += 2) {

        const uint8_t * ql1 = x[i].qs + q_offset;
        const uint8_t * qh  = x[i].qh + l0;
        const float   * y1  = yy + i*QK_K + y_offset;
        const float   * y2  = y1 + 128;

        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

        float4 sum = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        const uint16_t * q1 = (const uint16_t *)ql1;
        const uint16_t * q2 = q1 + 32;
        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[8] & 0x0f0f;
        q16[2] = (q1[0] >> 4) & 0x0f0f;
        q16[3] = (q1[8] >> 4) & 0x0f0f;
        q16[4] = q2[0] & 0x0f0f;
        q16[5] = q2[8] & 0x0f0f;
        q16[6] = (q2[0] >> 4) & 0x0f0f;
        q16[7] = (q2[8] >> 4) & 0x0f0f;
        for (int l = 0; l < n; ++l) {
            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
        }
        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
    }

#else
    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
    const int step = tid * K_QUANTS_PER_ITERATION;
    const int im = step/8;
    const int in = step%8;

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
        const uint8_t * q = x[i].qs + step;
        const int8_t  * s = x[i].scales;
        const float   * y = yy + i*QK_K + step;
        const float     d = x[i].d;
        float sum = 0.f;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            const uint8_t h = x[i].qh[in+j] >> im;
            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
        }
        tmp += sum;
    }
#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q6_K * x = (const block_q6_K *)vx + ib0;

#if QK_K == 256

    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

#if K_QUANTS_PER_ITERATION == 1
    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
    const int is = 0;
#else
    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
    const int is = in / 4;
#endif
    const int ql_offset = 64*im + l0;
    const int qh_offset = 32*im + l0;
    const int s_offset  =  8*im + is;
    const int y_offset = 128*im + l0;

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * ql = x[i].ql + ql_offset;
        const uint8_t * qh = x[i].qh + qh_offset;
        const int8_t  * s  = x[i].scales + s_offset;

        const float d = x[i].d;

#if K_QUANTS_PER_ITERATION == 1
        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
        tmp += sum;
#else
        float sum = 0;
        for (int l = 0; l < 4; ++l) {
            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
        }
        tmp += sum;
#endif

    }

#else

    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3

    const int step = tid * K_QUANTS_PER_ITERATION;

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + step;
        const uint8_t * ql = x[i].ql + step;
        const uint8_t * qh = x[i].qh + step;
        const int8_t  * s  = x[i].scales;

        const float d = x[i+0].d;

        float sum = 0;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
        }
        tmp += sum;

    }

#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
        dst[row] = tmp;
    }
}

static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const half * x = (const half *) vx;

    // automatic half -> float type cast if dfloat == float
    v.x = x[ib + iqs + 0];
    v.y = x[ib + iqs + 1];
}

static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
    const int ix = blockDim.x*blockIdx.x + threadIdx.x;

    if (ix >= kx_padded) {
        return;
    }

    const int iy = blockDim.y*blockIdx.y + threadIdx.y;

    const int i_padded = iy*kx_padded + ix;

    block_q8_1 * y = (block_q8_1 *) vy;

    const int ib = i_padded / QK8_1; // block index
    const int iqs = i_padded % QK8_1; // quant index

    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
    float amax = fabsf(xi);
    float sum = xi;

    amax = warp_reduce_max(amax);
    sum = warp_reduce_sum(sum);

    const float d = amax / 127;
    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);

    y[ib].qs[iqs] = q;

    if (iqs > 0) {
        return;
    }

    reinterpret_cast<half&>(y[ib].ds.x) = d;
    reinterpret_cast<half&>(y[ib].ds.y) = sum;
}

template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
            const void * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

    const int ib = i00/qk; // block index
    const int iqs = (i00%qk)/qr; // quant index
    const int iybs = i00 - i00%qk; // dst block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    dfloat2 v;
    dequantize_kernel(src0_row, ib, iqs, v);

    dst_row[iybs + iqs + 0]        = v.x;
    dst_row[iybs + iqs + y_offset] = v.y;
}

template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
            const src0_t * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);

    dst_row[i00] = src0_row[i00];
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);

    if (i >= k) {
        return;
    }

    const int ib = i/qk; // block index
    const int iqs = (i%qk)/qr; // quant index
    const int iybs = i - i%qk; // y block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    dfloat2 v;
    dequantize_kernel(vx, ib, iqs, v);

    y[iybs + iqs + 0]        = v.x;
    y[iybs + iqs + y_offset] = v.y;
}

template <typename src_t, typename dst_t>
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    const src_t * x = (src_t *) vx;

    y[i] = x[i];
}

template <bool need_check>
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
#if __CUDA_ARCH__ >= CC_PASCAL
    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
    const int * x0 = ((int *) vx) + blockIdx.x * nint;
    half2 * y2 = (half2 *) (y + i0);

    __shared__ int vals[nint];

#pragma unroll
    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
            break;
        }

        const int ix = ix0 + threadIdx.x;
        vals[ix] = x0[ix];
    }

#pragma unroll
    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
            return;
        }

        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
        const half    d = *b0;
        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];

        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
    }
#else
    (void) vx; (void) y; (void) k;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL
}

// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
    const int * v, const int * u, const float & d4, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = __dp4a(vi0, u[2*i+0], sumi);
        sumi = __dp4a(vi1, u[2*i+1], sumi);
    }

    const float2 ds8f = __half22float2(ds8);

    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q4_1_Q8_1_MMVQ 2
#define VDR_Q4_1_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = __dp4a(vi0, u[2*i+0], sumi);
        sumi = __dp4a(vi1, u[2*i+1], sumi);
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
    const float d4d8 = tmp.x;
    const float m4s8 = tmp.y;
#else
    const float2 dm4f = __half22float2(dm4);
    const float2 ds8f = __half22float2(ds8);
    const float d4d8 = dm4f.x * ds8f.x;
    const float m4s8 = dm4f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_0_Q8_1_MMVQ 2
#define VDR_Q5_0_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }

    const float2 ds8f = __half22float2(ds8);

    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_1_Q8_1_MMVQ 2
#define VDR_Q5_1_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
    const float d5d8 = tmp.x;
    const float m5s8 = tmp.y;
#else
    const float2 dm5f = __half22float2(dm5);
    const float2 ds8f = __half22float2(ds8);
    const float d5d8 = dm5f.x * ds8f.x;
    const float m5s8 = dm5f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q8_0_Q8_1_MMVQ 2
#define VDR_Q8_0_Q8_1_MMQ 8

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
    const int * v, const int * u, const float & d8_0, const float & d8_1) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = __dp4a(v[i], u[i], sumi);
    }

    return d8_0*d8_1 * sumi;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = __dp4a(v[i], u[i], sumi);
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
    const float d8d8 = tmp.x;
    const float m8s8 = tmp.y;
#else
    const float2 dm8f = __half22float2(dm8);
    const float2 ds8f = __half22float2(ds8);
    const float d8d8 = dm8f.x * ds8f.x;
    const float m8s8 = dm8f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q2_K_Q8_1_MMVQ 1
#define VDR_Q2_K_Q8_1_MMQ  2

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR2_K; ++i) {
        const int sc = scales[2*i];

        const int vi = (v >> (2*i)) & 0x03030303;

        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product

        // fill int with 4x m
        int m = sc >> 4;
        m |= m <<  8;
        m |= m << 16;
        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
    }

    const float2 dm2f = __half22float2(dm2);

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float & d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi_d = 0;
    int sumi_m = 0;

#pragma unroll
    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
        int sumi_d_sc = 0;

        const int sc = scales[i0 / (QI8_1/2)];

        // fill int with 4x m
        int m = sc >> 4;
        m |= m <<  8;
        m |= m << 16;

#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
        }

        sumi_d += sumi_d_sc * (sc & 0xF);
    }

    const float2 dm2f = __half22float2(dm2);

    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q3_K_Q8_1_MMVQ 1
#define VDR_Q3_K_Q8_1_MMQ  2

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const int & scale_offset, const float & d3, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;

        const int isc_low = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;

        const int isc_high = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;

        const int sc = (sc_low | sc_high) - 32;

        const int vil = (vl >> (2*i)) & 0x03030303;

        const int vih = ((vh >> i) << 2) & 0x04040404;

        const int vi = __vsubss4(vil, vih);

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d3 * sumf;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d3, const float & d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll
    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
        int sumi_sc = 0;

        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
        }

        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
    }

    return d3*d8 * sumi;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q4_K_Q8_1_MMVQ 2
#define VDR_Q4_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_K_Q8_1_MMVQ 2
#define VDR_Q5_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;

        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);

    }

    const float2 dm5f = __half22float2(dm5);

    return dm5f.x*sumf_d - dm5f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q6_K_Q8_1_MMVQ 1
#define VDR_Q6_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];

        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
    const float & d6, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;

#pragma unroll
    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale

#pragma unroll
        for (int i = i0; i < i0 + 2; ++i) {
            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product

            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
        }

        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
    }

    return d6 * sumf_d;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    int v[VDR_Q4_0_Q8_1_MMVQ];
    int u[2*VDR_Q4_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc;

    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];

    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh; (void)x_sc;
    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_0;
    const int kqsx = k % QI4_0;

    const block_q4_0 * bx0 = (const block_q4_0 *) vx;

    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; (void)x_sc;

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const float * x_dmf = (const float *) x_dm;

    int u[2*VDR_Q4_0_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

    int v[VDR_Q4_1_Q8_1_MMVQ];
    int u[2*VDR_Q4_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc;

    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE) +     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];

    *x_ql = tile_x_qs;
    *x_dm = tile_x_dm;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh; (void)x_sc;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_1;
    const int kqsx = k % QI4_1;

    const block_q4_1 * bx0 = (const block_q4_1 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
    }
}

static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; (void)x_sc;

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

    int u[2*VDR_Q4_1_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

    int vl[VDR_Q5_0_Q8_1_MMVQ];
    int vh[VDR_Q5_0_Q8_1_MMVQ];
    int  u[2*VDR_Q5_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
    }

    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc;

    __shared__ int  tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];

    *x_ql = tile_x_ql;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh; (void)x_sc;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_0;
    const int kqsx = k % QI5_0;

    const block_q5_0 * bx0 = (const block_q5_0 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;

        const int ql = get_int_from_uint8(bxi->qs, kqsx);
        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));

        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;

        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; (void)x_sc;

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    int u[2*VDR_Q5_0_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
    }

    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

    int vl[VDR_Q5_1_Q8_1_MMVQ];
    int vh[VDR_Q5_1_Q8_1_MMVQ];
    int  u[2*VDR_Q5_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
    }

    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc;

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh; (void)x_sc;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset < nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_1;
    const int kqsx = k % QI5_1;

    const block_q5_1 * bx0 = (const block_q5_1 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;

        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));

        int qs0 = (ql >>  0) & 0x0F0F0F0F;
        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;

        int qs1 = (ql >>  4) & 0x0F0F0F0F;
        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28

        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
    }
}

static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; (void)x_sc;

    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;

    int u[2*VDR_Q5_1_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
    }

    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
}

static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    int v[VDR_Q8_0_Q8_1_MMVQ];
    int u[VDR_Q8_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
    }

    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc;

    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];

    *x_ql = tile_x_qs;
    *x_dm = (half2 *) tile_x_d;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh; (void)x_sc;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI8_0;
    const int kqsx = k % QI8_0;
    float * x_dmf = (float *) x_dm;

    const block_q8_0 * bx0 = (const block_q8_0 *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
    }
}

static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh; (void)x_sc;

    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q2_K * bq2_K = (const block_q2_K *) vbq;

    const int bq8_offset = QR2_K * (iqs / QI8_1);
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const uint8_t * scales = bq2_K->scales + scale_offset;

    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
    int    u[QR2_K];
    float d8[QR2_K];

#pragma unroll
    for (int i = 0; i < QR2_K; ++ i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh;

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI2_K;
    const int kqsx = k % QI2_K;

    const block_q2_K * bx0 = (const block_q2_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
    }
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh;

    const int kbx = k / QI2_K;
    const int ky  = (k % QI2_K) * QR2_K;
    const float * y_df = (const float *) y_ds;

    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];

    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));

#pragma unroll
    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
    }

    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;

    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q3_K * bq3_K = (const block_q3_K *) vbq;

    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const float d = bq3_K->d;

    const int vl = get_int_from_uint8(bq3_K->qs, iqs);

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

    int    u[QR3_K];
    float d8[QR3_K];

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2)     + mmq_y/2];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI3_K;
    const int kqsx = k % QI3_K;

    const block_q3_K * bx0 = (const block_q3_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
    const int kbxd = k % blocks_per_tile_x_row;
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);

        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);

        const int ksc = k % (QI3_K/4);

        const int ksc_low = ksc % (QI3_K/8);
        const int shift_low = 4 * (ksc / (QI3_K/8));
        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;

        const int ksc_high = QI3_K/8;
        const int shift_high = 2 * ksc;
        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;

        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
    }
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    const int kbx  = k / QI3_K;
    const int ky  = (k % QI3_K) * QR3_K;
    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

#pragma unroll
    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
        const int shift = 2 * ((ky % 32) / 8);
        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;

        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
        const int vlh = (vh << 2) & 0x04040404;

        v[l] = __vsubss4(vll, vlh);
    }

    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    int    v[2];
    int    u[2*QR4_K];
    float d8[QR4_K];

    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    v[0] = q4[0];
    v[1] = q4[4];

    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m  = sc + 2;

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;

    const uint16_t * a = (const uint16_t *)bq4_K->scales;
    aux16[0] = a[0] & 0x0f0f;
    aux16[1] = (a[0] >> 4) & 0x0f0f;

    const float dall = bq4_K->dm[0];
    const float dmin = bq4_K->dm[1];

    const float d8_1 = __low2float(bq8_1[0].ds);
    const float d8_2 = __low2float(bq8_1[1].ds);

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
    const int v1 = q4[0];
    const int v2 = q4[4];

    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));

    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);

    return dall * sumf_d - dmin * sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh;

    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
    const int kqsx = k % QI4_K; // == k if QK_K == 256

    const block_q4_K * bx0 = (const block_q4_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
#else
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
#endif
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

        const int * scales = (const int *) bxi->scales;

        const int ksc = k % (WARP_SIZE/8);

        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits

        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh;

    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    int   vl[2];
    int   vh[2];
    int    u[2*QR5_K];
    float d8[QR5_K];

    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m  = sc + 2;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    const int8_t * s = bq5_K->scales;

    const float d = bq5_K->d;

    const float d8_1 = __low2half(bq8_1[0].ds);
    const float d8_2 = __low2half(bq8_1[1].ds);

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * ql = (const int *)bq5_K->qs + (iqs/2);
    const int vl1 = ql[0];
    const int vl2 = ql[4];

    const int step = 4 * (iqs/2); // 0, 4, 8, 12
    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
    const int in = step%8; // 0, 4, 0, 4
    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;

    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);

    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);

    return d * sumf_d;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh;

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
    const int kqsx = k % QI5_K; // == k if QK_K == 256

    const block_q5_K * bx0 = (const block_q5_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR5_K*kqsx;

        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;

        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
#endif
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

        const int * scales = (const int *) bxi->scales;

        const int ksc = k % (WARP_SIZE/8);

        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits

        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
    }
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh;

    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q6_K * bq6_K = (const block_q6_K *) vbq;

    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

    int    u[QR6_K];
    float d8[QR6_K];

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
    }

    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
}

template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
    (void)x_qh;

    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
    (void)x_qh;

    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset <  nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    const block_q6_K * bx0 = (const block_q6_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
        const int ky = QR6_K*kqsx;

        const int ql = get_int_from_uint8(bxi->ql, kqsx);
        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
        const int ql1 = (ql >> 4) & 0x0F0F0F0F;

        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;

        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);

        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
    float * x_dmf = (float *) x_dm;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
    }
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
    (void)x_qh;

    const float * x_dmf = (const float *) x_dm;
    const float * y_df  = (const float *) y_ds;

    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
}

static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if QK_K == 256
    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

#if QR2_XXS == 8
    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const uint8_t  * aux8 = (const uint8_t *)q2;
    const int8_t   * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = q2[2] | (q2[3] << 16);
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
        for (int j = 0; j < 8; ++j) {
            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
        }
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * sumi;
#else
    // iqs is 0...15
    const int ib32 = iqs/2;
    const int il = iqs%2;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const uint8_t  * aux8 = (const uint8_t *)q2;
    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < 8; ++j) {
        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
    }
    return d * (sumi1 + sumi2);
#endif
#else
    assert(false);
    return 0.f;
#endif
}

static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >>  4;
    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }
    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }
    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    (void) ksigns64;
    assert(false);
    return 0.f;
#endif
#else
    (void) ksigns64;
    assert(false);
    return 0.f;
#endif
}

// TODO
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

    const int ib32 = iqs;
    const int8_t  * q8 = bq8_1[ib32].qs;
    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >>  4;
    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }
    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }
    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    (void) ksigns64;
    assert(false);
    return 0.f;
#endif
#else
    (void) ksigns64;
    assert(false);
    return 0.f;
#endif
}

static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

    const int ib32 = iqs;
    const uint8_t  * q3 = bq2->qs + 8*ib32;
    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = gas[0] | (gas[1] << 16);
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}

// TODO: don't use lookup table for signs
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

    const int ib32 = iqs;
    const uint8_t  * qs = bq2->qs + 8*ib32;
    const int8_t   * q8 = bq8_1[ib32].qs;
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
        q8 += 8;
    }
    const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}

static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if QK_K == 256
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

    const int ib32 = iqs;
    int sumi = 0;
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const int * q8 = (const int *)bq8_1[ib32].qs;
    for (int l = 0; l < 4; ++l) {
        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
        int grid0 = grid[0] & 0x0f0f0f0f;
        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
        sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
    }
#else
    const int8_t * q8 = bq8_1[ib32].qs;
    for (int l = 0; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
        for (int j = 0; j < 4; ++j) {
            sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
        }
        q8 += 8;
    }
#endif
    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
    const float d = d1q * __low2float (bq8_1[ib32].ds);
    const float m = d1q * __high2float(bq8_1[ib32].ds);
    return d * sumi + m * delta;
#else
    assert(false);
    return 0.f;
#endif
}

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
        int & val1, int & val2) {

    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
    aux32 = q4 & 0x0f0f0f0f;
    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
    val1 = v1 | (v2 << 16);
    aux32 = (q4 >> 4) & 0x0f0f0f0f;
    v1 = values[q8[0]] | (values[q8[1]] << 8);
    v2 = values[q8[2]] | (values[q8[3]] << 8);
    val2 = v1 | (v2 << 16);
}
#endif

static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;

    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
        get_int_from_table_16(aux, values, v1, v2);
        sumi1 = __dp4a(v1, q8[l+0], sumi1);
        sumi2 = __dp4a(v2, q8[l+4], sumi2);
    }

#else
    const uint8_t * q4 = bq->qs + 4*iqs;
    const int8_t  * q8 = bq8_1->qs + 4*iqs;

    int sumi1 = 0, sumi2 = 0;
    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
    }
#endif
    const float d = (float)bq->d * __low2float(bq8_1->ds);
    return d * (sumi1 + sumi2);
}

static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if QK_K == 256
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    //// iqs is 0...7
    //const int ib64 = iqs/2;
    //const int il = iqs%2;
    //const int32_t  * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
    //const int32_t  * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
    //const uint32_t * q4_2 = q4_1 + 4;
    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
    //const int8_t ls2 = (bq4->scales_l[ib64] >>  4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
    //int v1, v2;
    //int sumi1 = 0, sumi2 = 0;
    //for (int j = 0; j < 2; ++j) {
    //    get_int_from_table_16(q4_1[j], values, v1, v2);
    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
    //    get_int_from_table_16(q4_2[j], values, v1, v2);
    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
    //}
    //return d1 * sumi1 + d2 * sumi2;

    // iqs is 0...7
    const int ib32 = iqs;
    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < 4; ++j) {
        get_int_from_table_16(q4[j], values, v1, v2);
        sumi1 = __dp4a(v1, q8[j+0], sumi1);
        sumi2 = __dp4a(v2, q8[j+4], sumi2);
    }
    return d * (sumi1 + sumi2);

    //// iqs is 0...15
    //const int ib32 = iqs/2;
    //const int il = iqs%2;
    //const int32_t  * q8 = (const int *)bq8_1[ib32].qs + 2*il;
    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
    //int v1, v2;
    //int sumi1 = 0, sumi2 = 0;
    //for (int j = 0; j < 2; ++j) {
    //    get_int_from_table_16(q4[j], values, v1, v2);
    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
    //}
    //return d * (sumi1 + sumi2);
#else
    assert(false);
    return 0.f;
#endif
#else
    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
#endif
}

template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
static __device__ __forceinline__ void mul_mat_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    const int blocks_per_row_x = ncols_x / qk;
    const int blocks_per_col_y = nrows_y / QK8_1;
    const int blocks_per_warp = WARP_SIZE / qi;

    const int & ncols_dst = ncols_y;

    const int row_dst_0 = blockIdx.x*mmq_y;
    const int & row_x_0 = row_dst_0;

    const int col_dst_0 = blockIdx.y*mmq_x;
    const int & col_y_0 = col_dst_0;

    int   * tile_x_ql = nullptr;
    half2 * tile_x_dm = nullptr;
    int   * tile_x_qh = nullptr;
    int   * tile_x_sc = nullptr;

    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE];
    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);

#pragma unroll
        for (int ir = 0; ir < qr; ++ir) {
            const int kqs = ir*WARP_SIZE + threadIdx.x;
            const int kbxd = kqs / QI8_1;

#pragma unroll
            for (int i = 0; i < mmq_x; i += nwarps) {
                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
            }

#pragma unroll
            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);

                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
                if (need_sum) {
                    *dsi_dst = *dsi_src;
                } else {
                    float * dfi_dst = (float *) dsi_dst;
                    *dfi_dst = __low2float(*dsi_src);
                }
            }

            __syncthreads();

// #pragma unroll // unrolling this loop causes too much register pressure
            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
#pragma unroll
                for (int j = 0; j < mmq_x; j += nwarps) {
#pragma unroll
                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
                            threadIdx.x + i, threadIdx.y + j, k);
                    }
                }
            }

            __syncthreads();
        }
    }

#pragma unroll
    for (int j = 0; j < mmq_x; j += nwarps) {
        const int col_dst = col_dst_0 + j + threadIdx.y;

        if (col_dst >= ncols_dst) {
            return;
        }

#pragma unroll
        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
            const int row_dst = row_dst_0 + threadIdx.x + i;

            if (row_dst >= nrows_dst) {
                continue;
            }

            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
        }
    }
}

#define  MMQ_X_Q4_0_RDNA2  64
#define  MMQ_Y_Q4_0_RDNA2  128
#define NWARPS_Q4_0_RDNA2  8
#define  MMQ_X_Q4_0_RDNA1  64
#define  MMQ_Y_Q4_0_RDNA1  64
#define NWARPS_Q4_0_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q4_0_AMPERE 4
#define  MMQ_Y_Q4_0_AMPERE 32
#define NWARPS_Q4_0_AMPERE 4
#else
#define  MMQ_X_Q4_0_AMPERE 64
#define  MMQ_Y_Q4_0_AMPERE 128
#define NWARPS_Q4_0_AMPERE 4
#endif
#define  MMQ_X_Q4_0_PASCAL 64
#define  MMQ_Y_Q4_0_PASCAL 64
#define NWARPS_Q4_0_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q4_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
    const int nwarps = NWARPS_Q4_0_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
    const int nwarps = NWARPS_Q4_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
    const int nwarps = NWARPS_Q4_0_AMPERE;

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
    const int nwarps = NWARPS_Q4_0_PASCAL;

    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q4_0_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q4_1_RDNA2  64
#define  MMQ_Y_Q4_1_RDNA2  128
#define NWARPS_Q4_1_RDNA2  8
#define  MMQ_X_Q4_1_RDNA1  64
#define  MMQ_Y_Q4_1_RDNA1  64
#define NWARPS_Q4_1_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q4_1_AMPERE 4
#define  MMQ_Y_Q4_1_AMPERE 32
#define NWARPS_Q4_1_AMPERE 4
#else
#define  MMQ_X_Q4_1_AMPERE 64
#define  MMQ_Y_Q4_1_AMPERE 128
#define NWARPS_Q4_1_AMPERE 4
#endif
#define  MMQ_X_Q4_1_PASCAL 64
#define  MMQ_Y_Q4_1_PASCAL 64
#define NWARPS_Q4_1_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q4_1(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
    const int nwarps = NWARPS_Q4_1_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
    const int nwarps = NWARPS_Q4_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
    const int nwarps = NWARPS_Q4_1_AMPERE;

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
    const int nwarps = NWARPS_Q4_1_PASCAL;

    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q4_1_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q5_0_RDNA2  64
#define  MMQ_Y_Q5_0_RDNA2  128
#define NWARPS_Q5_0_RDNA2  8
#define  MMQ_X_Q5_0_RDNA1  64
#define  MMQ_Y_Q5_0_RDNA1  64
#define NWARPS_Q5_0_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q5_0_AMPERE 4
#define  MMQ_Y_Q5_0_AMPERE 32
#define NWARPS_Q5_0_AMPERE 4
#else
#define  MMQ_X_Q5_0_AMPERE 128
#define  MMQ_Y_Q5_0_AMPERE 64
#define NWARPS_Q5_0_AMPERE 4
#endif
#define  MMQ_X_Q5_0_PASCAL 64
#define  MMQ_Y_Q5_0_PASCAL 64
#define NWARPS_Q5_0_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q5_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
    const int nwarps = NWARPS_Q5_0_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
    const int nwarps = NWARPS_Q5_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
    const int nwarps = NWARPS_Q5_0_AMPERE;

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
    const int nwarps = NWARPS_Q5_0_PASCAL;

    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q5_0_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q5_1_RDNA2  64
#define  MMQ_Y_Q5_1_RDNA2  128
#define NWARPS_Q5_1_RDNA2  8
#define  MMQ_X_Q5_1_RDNA1  64
#define  MMQ_Y_Q5_1_RDNA1  64
#define NWARPS_Q5_1_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q5_1_AMPERE 4
#define  MMQ_Y_Q5_1_AMPERE 32
#define NWARPS_Q5_1_AMPERE 4
#else
#define  MMQ_X_Q5_1_AMPERE 128
#define  MMQ_Y_Q5_1_AMPERE 64
#define NWARPS_Q5_1_AMPERE 4
#endif
#define  MMQ_X_Q5_1_PASCAL 64
#define  MMQ_Y_Q5_1_PASCAL 64
#define NWARPS_Q5_1_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_1(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
    const int nwarps = NWARPS_Q5_1_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
    const int nwarps = NWARPS_Q5_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
    const int nwarps = NWARPS_Q5_1_AMPERE;

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
    const int nwarps = NWARPS_Q5_1_PASCAL;

    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q5_1_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q8_0_RDNA2  64
#define  MMQ_Y_Q8_0_RDNA2  128
#define NWARPS_Q8_0_RDNA2  8
#define  MMQ_X_Q8_0_RDNA1  64
#define  MMQ_Y_Q8_0_RDNA1  64
#define NWARPS_Q8_0_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q8_0_AMPERE 4
#define  MMQ_Y_Q8_0_AMPERE 32
#define NWARPS_Q8_0_AMPERE 4
#else
#define  MMQ_X_Q8_0_AMPERE 128
#define  MMQ_Y_Q8_0_AMPERE 64
#define NWARPS_Q8_0_AMPERE 4
#endif
#define  MMQ_X_Q8_0_PASCAL 64
#define  MMQ_Y_Q8_0_PASCAL 64
#define NWARPS_Q8_0_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    mul_mat_q8_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
    const int nwarps = NWARPS_Q8_0_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
    const int nwarps = NWARPS_Q8_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
    const int nwarps = NWARPS_Q8_0_AMPERE;

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
    const int nwarps = NWARPS_Q8_0_PASCAL;

    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q8_0_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q2_K_RDNA2  64
#define  MMQ_Y_Q2_K_RDNA2  128
#define NWARPS_Q2_K_RDNA2  8
#define  MMQ_X_Q2_K_RDNA1  128
#define  MMQ_Y_Q2_K_RDNA1  32
#define NWARPS_Q2_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q2_K_AMPERE 4
#define  MMQ_Y_Q2_K_AMPERE 32
#define NWARPS_Q2_K_AMPERE 4
#else
#define  MMQ_X_Q2_K_AMPERE 64
#define  MMQ_Y_Q2_K_AMPERE 128
#define NWARPS_Q2_K_AMPERE 4
#endif
#define  MMQ_X_Q2_K_PASCAL 64
#define  MMQ_Y_Q2_K_PASCAL 64
#define NWARPS_Q2_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q2_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
    const int nwarps = NWARPS_Q2_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
    const int nwarps = NWARPS_Q2_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
    const int nwarps = NWARPS_Q2_K_AMPERE;

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
    const int nwarps = NWARPS_Q2_K_PASCAL;

    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q2_K_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q3_K_RDNA2  128
#define  MMQ_Y_Q3_K_RDNA2  64
#define NWARPS_Q3_K_RDNA2  8
#define  MMQ_X_Q3_K_RDNA1  32
#define  MMQ_Y_Q3_K_RDNA1  128
#define NWARPS_Q3_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q3_K_AMPERE 4
#define  MMQ_Y_Q3_K_AMPERE 32
#define NWARPS_Q3_K_AMPERE 4
#else
#define  MMQ_X_Q3_K_AMPERE 128
#define  MMQ_Y_Q3_K_AMPERE 128
#define NWARPS_Q3_K_AMPERE 4
#endif
#define  MMQ_X_Q3_K_PASCAL 64
#define  MMQ_Y_Q3_K_PASCAL 64
#define NWARPS_Q3_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q3_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
    const int nwarps = NWARPS_Q3_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
    const int nwarps = NWARPS_Q3_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
    const int nwarps = NWARPS_Q3_K_AMPERE;

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
    const int nwarps = NWARPS_Q3_K_PASCAL;

    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q3_K_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q4_K_RDNA2  64
#define  MMQ_Y_Q4_K_RDNA2  128
#define NWARPS_Q4_K_RDNA2  8
#define  MMQ_X_Q4_K_RDNA1  32
#define  MMQ_Y_Q4_K_RDNA1  64
#define NWARPS_Q4_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q4_K_AMPERE 4
#define  MMQ_Y_Q4_K_AMPERE 32
#define NWARPS_Q4_K_AMPERE 4
#else
#define  MMQ_X_Q4_K_AMPERE 64
#define  MMQ_Y_Q4_K_AMPERE 128
#define NWARPS_Q4_K_AMPERE 4
#endif
#define  MMQ_X_Q4_K_PASCAL 64
#define  MMQ_Y_Q4_K_PASCAL 64
#define NWARPS_Q4_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q4_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
    const int nwarps = NWARPS_Q4_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
    const int nwarps = NWARPS_Q4_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
    const int nwarps = NWARPS_Q4_K_AMPERE;

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
    const int nwarps = NWARPS_Q4_K_PASCAL;

    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q4_K_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q5_K_RDNA2  64
#define  MMQ_Y_Q5_K_RDNA2  128
#define NWARPS_Q5_K_RDNA2  8
#define  MMQ_X_Q5_K_RDNA1  32
#define  MMQ_Y_Q5_K_RDNA1  64
#define NWARPS_Q5_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q5_K_AMPERE 4
#define  MMQ_Y_Q5_K_AMPERE 32
#define NWARPS_Q5_K_AMPERE 4
#else
#define  MMQ_X_Q5_K_AMPERE 64
#define  MMQ_Y_Q5_K_AMPERE 128
#define NWARPS_Q5_K_AMPERE 4
#endif
#define  MMQ_X_Q5_K_PASCAL 64
#define  MMQ_Y_Q5_K_PASCAL 64
#define NWARPS_Q5_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
    const int nwarps = NWARPS_Q5_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
    const int nwarps = NWARPS_Q5_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
    const int nwarps = NWARPS_Q5_K_AMPERE;

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
    const int nwarps = NWARPS_Q5_K_PASCAL;

    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q5_K_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

#define  MMQ_X_Q6_K_RDNA2  64
#define  MMQ_Y_Q6_K_RDNA2  128
#define NWARPS_Q6_K_RDNA2  8
#define  MMQ_X_Q6_K_RDNA1  32
#define  MMQ_Y_Q6_K_RDNA1  64
#define NWARPS_Q6_K_RDNA1  8
#if defined(CUDA_USE_TENSOR_CORES)
#define  MMQ_X_Q6_K_AMPERE 4
#define  MMQ_Y_Q6_K_AMPERE 32
#define NWARPS_Q6_K_AMPERE 4
#else
#define  MMQ_X_Q6_K_AMPERE 64
#define  MMQ_Y_Q6_K_AMPERE 64
#define NWARPS_Q6_K_AMPERE 4
#endif
#define  MMQ_X_Q6_K_PASCAL 64
#define  MMQ_Y_Q6_K_PASCAL 64
#define NWARPS_Q6_K_PASCAL 8

template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_VOLTA
    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_VOLTA
    mul_mat_q6_K(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
    const int nwarps = NWARPS_Q6_K_RDNA2;
#else
    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
    const int nwarps = NWARPS_Q6_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= CC_VOLTA
    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
    const int nwarps = NWARPS_Q6_K_AMPERE;

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

#elif __CUDA_ARCH__ >= MIN_CC_DP4A
    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
    const int nwarps = NWARPS_Q6_K_PASCAL;

    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    (void) vec_dot_q6_K_q8_1_mul_mat;
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}

template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
    constexpr int nwarps              = 1;
    constexpr int rows_per_cuda_block = 1;
#else
    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)

    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    const     int row0 = rows_per_cuda_block*blockIdx.x;
    const     int blocks_per_row_x = ncols_x / qk;
    const     int blocks_per_col_y = nrows_y / QK8_1;
    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;

// partial sum for each thread
    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx

        // x block quant index when casting the quants to int
        const int kqs = vdr * (tid % (qi/vdr));

#pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp[j][i] += vec_dot_q_cuda(
                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
            }
        }
    }

    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
    if (threadIdx.y > 0) {
#pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
            }
        }
    }
    __syncthreads();
    if (threadIdx.y > 0) {
        return;
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
        for (int i = 0; i < rows_per_cuda_block; ++i) {
#pragma unroll
            for (int l = 0; l < nwarps-1; ++l) {
                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
            }
            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
        }

        if (threadIdx.x < rows_per_cuda_block) {
            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
        }
    }
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    // qk = quantized weights per x block
    // qr = number of quantized weights per data value in x block
    const int row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
    }

    const int tid = threadIdx.x;

    const int iter_stride = 2*GGML_CUDA_DMMV_X;
    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
    const int y_offset = qr == 1 ? 1 : qk/2;

// partial sum for each thread
#ifdef GGML_CUDA_F16
    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
#else
    float tmp = 0.0f;
#endif // GGML_CUDA_F16

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
        const int ib = (row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr; // x quant index
        const int iybs = col - col%qk; // y block start index

// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
        for (int j = 0; j < vals_per_iter; j += 2) {
            // process 2 vals per j iter

            // dequantize
            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
            dfloat2 v;
            dequantize_kernel(vx, ib, iqs + j/qr, v);

            // matrix multiplication
            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
            tmp += __hmul2(v, {
                y[iybs + iqs + j/qr + 0],
                y[iybs + iqs + j/qr + y_offset]
            });
#else
            tmp += v.x * y[iybs + iqs + j/qr + 0];
            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
#endif // GGML_CUDA_F16
        }
    }

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
#ifdef GGML_CUDA_F16
        dst[row] = tmp.x + tmp.y;
#else
        dst[row] = tmp;
#endif // GGML_CUDA_F16
    }
}

static __global__ void mul_mat_p021_f16_f32(
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {

    const half * x = (const half *) vx;

    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / (nchannels_y / nchannels_x);

    const int nrows_y = ncols_x;
    const int nrows_dst = nrows_x;
    const int row_dst = row_x;

    float tmp = 0.0f;

    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
        const int col_x = col_x0 + threadIdx.x;

        if (col_x >= ncols_x) {
            break;
        }

        // x is transposed and permuted
        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
        const float xi = __half2float(x[ix]);

        const int row_y = col_x;

        // y is not transposed but permuted
        const int iy = channel*nrows_y + row_y;

        tmp += xi * y[iy];
    }

    // dst is not transposed and not permuted
    const int idst = channel*nrows_dst + row_dst;

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[idst] = tmp;
    }
}

static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {

    const half * x = (const half *) vx;

    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / channel_x_divisor;

    const int nrows_y   = ncols_x;
    const int nrows_dst = nrows_x;
    const int row_dst   = row_x;

    const int idst = channel*nrows_dst + row_dst;

    float tmp = 0.0f;

    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
        const int col_x = col_x0 + threadIdx.x;

        if (col_x >= ncols_x) {
            break;
        }

        const int row_y = col_x;

        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
        const int iy = channel*nrows_y + row_y;

        const float xi = __half2float(x[ix]);

        tmp += xi * y[iy];
    }

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        dst[idst] = tmp;
    }
}

static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    float * dsti = (float *) cdsti;

    *dsti = *xi;
}

static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    half * dsti = (half *) cdsti;

    *dsti = __float2half(*xi);
}

static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
    const half * xi = (const half *) cxi;
    half * dsti = (half *) cdsti;

    *dsti = *xi;
}

static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
    const half * xi = (const half *) cxi;
    float * dsti = (float *) cdsti;

    *dsti = *xi;
}

template <cpy_kernel_t cpy_1>
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13) {
    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
    // then combine those indices with the corresponding byte offsets to get the total offsets
    const int64_t i03 = i/(ne00 * ne01 * ne02);
    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

    const int64_t i13 = i/(ne10 * ne11 * ne12);
    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;

    cpy_1(cx + x_offset, cdst + dst_offset);
}

static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q8_0 * dsti = (block_q8_0 *) cdsti;

    float amax = 0.0f; // absolute max

    for (int j = 0; j < QK8_0; j++) {
        const float v = xi[j];
        amax = fmaxf(amax, fabsf(v));
    }

    const float d = amax / ((1 << 7) - 1);
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    for (int j = 0; j < QK8_0; ++j) {
        const float x0 = xi[j]*id;

        dsti->qs[j] = roundf(x0);
    }
}

static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q4_0 * dsti = (block_q4_0 *) cdsti;

    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK4_0; ++j) {
        const float v = xi[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / -8;
    const float id = d ? 1.0f/d : 0.0f;

    dsti->d = d;

    for (int j = 0; j < QK4_0/2; ++j) {
        const float x0 = xi[0       + j]*id;
        const float x1 = xi[QK4_0/2 + j]*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));

        dsti->qs[j]  = xi0;
        dsti->qs[j] |= xi1 << 4;
    }
}

static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q4_1 * dsti = (block_q4_1 *) cdsti;

    float vmin = FLT_MAX;
    float vmax = -FLT_MAX;

    for (int j = 0; j < QK4_1; ++j) {
        const float v = xi[j];

        if (v < vmin) vmin = v;
        if (v > vmax) vmax = v;
    }

    const float d  = (vmax - vmin) / ((1 << 4) - 1);
    const float id = d ? 1.0f/d : 0.0f;

    dsti->dm.x = d;
    dsti->dm.y = vmin;

    for (int j = 0; j < QK4_1/2; ++j) {
        const float x0 = (xi[0       + j] - vmin)*id;
        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));

        dsti->qs[j]  = xi0;
        dsti->qs[j] |= xi1 << 4;
    }
}

template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                 const int nb12, const int nb13) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

    const int i03 = i/(ne00 * ne01 * ne02);
    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

    const int i13 = i/(ne10 * ne11 * ne12);
    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
}

static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - min(1.0f, max(0.0f, y));
}

struct rope_corr_dims {
    float v[4];
};

// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
static __device__ void rope_yarn(
    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
    float * cos_theta, float * sin_theta
) {
    // Get n-d rotational scaling corrected for extrapolation
    float theta_interp = freq_scale * theta_extrap;
    float theta = theta_interp;
    if (ext_factor != 0.0f) {
        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

        // Get n-d magnitude scaling corrected for interpolation
        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
    }
    *cos_theta = cosf(theta) * mscale;
    *sin_theta = sinf(theta) * mscale;
}

// rope == RoPE == rotary positional embedding
template<typename T, bool has_pos>
static __global__ void rope(
    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims
) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

    const int p = has_pos ? pos[i2] : 0;
    const float theta_base = p*powf(freq_base, -float(col)/ncols);

    float cos_theta, sin_theta;
    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + 1];

    dst[i + 0] = x0*cos_theta - x1*sin_theta;
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
}

template<typename T, bool has_pos>
static __global__ void rope_neox(
    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int ib = col / n_dims;
    const int ic = col % n_dims;

    if (ib > 0) {
        const int i = row*ncols + ib*n_dims + ic;

        dst[i + 0] = x[i + 0];
        dst[i + 1] = x[i + 1];

        return;
    }

    const int i  = row*ncols + ib*n_dims + ic/2;
    const int i2 = row/p_delta_rows;

    float cur_rot = inv_ndims * ic - ib;

    const int p = has_pos ? pos[i2] : 0;
    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

    float cos_theta, sin_theta;
    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + n_dims/2];

    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}

static __global__ void rope_glm_f32(
    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
    int n_ctx
) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int half_n_dims = ncols/4;

    if (col >= half_n_dims) {
        return;
    }

    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
     // FIXME: this is likely wrong
    const int p = pos != nullptr ? pos[i2] : 0;

    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + half_n_dims];

    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
    const float sin_block_theta = sinf(block_theta);
    const float cos_block_theta = cosf(block_theta);

    const float x2 = x[i + half_n_dims * 2];
    const float x3 = x[i + half_n_dims * 3];

    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}

static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
                                 const int n_heads_log2_floor, const float m0, const float m1) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    const int i = row*ncols + col;

    const int k = row/k_rows;

    float m_k;
    if (k < n_heads_log2_floor) {
        m_k = powf(m0, k + 1);
    } else {
        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
    }

    dst[i] = col * m_k + x[i];
}

static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    float sum = 0.0f;
    for (int i = col; i < ncols; i += blockDim.x) {
        sum += x[row * ncols + i];
    }

    sum = warp_reduce_sum(sum);

    if (col == 0) {
        dst[row] = sum;
    }
}

template<typename T>
static inline __device__ void swap(T & a, T & b) {
    T tmp = a;
    a = b;
    b = tmp;
}

template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
    // bitonic sort
    int col = threadIdx.x;
    int row = blockIdx.y;

    if (col >= ncols) return;

    const float * x_row = x + row * ncols;
    int * dst_row = dst + row * ncols;

    // initialize indices
    if (col < ncols) {
        dst_row[col] = col;
    }
    __syncthreads();

    for (int k = 2; k <= ncols; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                if ((col & k) == 0) {
                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
                        swap(dst_row[col], dst_row[ixj]);
                    }
                } else {
                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
                        swap(dst_row[col], dst_row[ixj]);
                    }
                }
            }
            __syncthreads();
        }
    }
}

static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
    const int col = blockDim.y*blockIdx.y + threadIdx.y;
    const int row = blockDim.x*blockIdx.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

    const int i = row*ncols + col;
    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
}

template <bool vals_smem, int ncols_template, int block_size_template>
static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

    const int tid  = threadIdx.x;
    const int rowx = blockIdx.x;
    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension

    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    float slope = 0.0f;

    // ALiBi
    if (max_bias > 0.0f) {
        const int h = rowx/nrows_y; // head index

        const float base = h < n_head_log2 ? m0 : m1;
        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

        slope = powf(base, exp);
    }

    extern __shared__ float data_soft_max_f32[];
    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
    // shared memory buffer to cache values between iterations:
    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;

    float max_val = -INFINITY;

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            break;
        }

        const int ix = rowx*ncols + col;
        const int iy = rowy*ncols + col;

        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);

        vals[col] = val;
        max_val = max(max_val, val);
    }

    // find the max value in the block
    max_val = warp_reduce_max(max_val);
    if (block_size > WARP_SIZE) {
        if (warp_id == 0) {
            buf_iw[lane_id] = -INFINITY;
        }
        __syncthreads();

        if (lane_id == 0) {
            buf_iw[warp_id] = max_val;
        }
        __syncthreads();

        max_val = buf_iw[lane_id];
        max_val = warp_reduce_max(max_val);
    }

    float tmp = 0.0f; // partial sum

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            break;
        }

        const float val = expf(vals[col] - max_val);
        tmp += val;
        vals[col] = val;
    }

    // find the sum of exps in the block
    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        __syncthreads();
        if (warp_id == 0) {
            buf_iw[lane_id] = 0.0f;
        }
        __syncthreads();

        if (lane_id == 0) {
            buf_iw[warp_id] = tmp;
        }
        __syncthreads();

        tmp = buf_iw[lane_id];
        tmp = warp_reduce_sum(tmp);
    }

    const float inv_sum = 1.0f / tmp;

#pragma unroll
    for (int col0 = 0; col0 < ncols; col0 += block_size) {
        const int col = col0 + tid;

        if (ncols_template == 0 && col >= ncols) {
            return;
        }

        const int idst = rowx*ncols + col;
        dst[idst] = vals[col] * inv_sum;
    }
}

static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = scale * x[i];
}

static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
}

template <typename T>
static  __global__ void im2col_kernel(
        const float * x, T * dst, int64_t batch_offset,
        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
        int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= pelements) {
        return;
    }

    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
    const int64_t  kx = i / ksize;
    const int64_t  kd = kx * ksize;
    const int64_t  ky = (i - kd) / OW;
    const int64_t  ix = i % OW;

    const int64_t  oh = blockIdx.y;
    const int64_t  batch = blockIdx.z / IC;
    const int64_t  ic = blockIdx.z % IC;

    const int64_t iiw = ix * s0 + kx * d0 - p0;
    const int64_t iih = oh * s1 + ky * d1 - p1;

    const int64_t offset_dst =
        ((batch * OH + oh) * OW + ix) * CHW +
        (ic * (KW * KH) + ky * KW + kx);

    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
        dst[offset_dst] = 0.0f;
    } else {
        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
        dst[offset_dst] = x[offset_src + iih * IW + iiw];
    }
}

template <typename Ti, typename To>
static  __global__ void pool2d_nchw_kernel(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const Ti* src, To* dst, const enum ggml_op_pool op) {
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx >= parallel_elements) {
            return;
        }

        const int I_HW = ih * iw;
        const int O_HW = oh * ow;
        const int nc = idx / O_HW;
        const int cur_oh = idx % O_HW / ow;
        const int cur_ow = idx % O_HW % ow;
        const Ti* i_ptr = src + nc * I_HW;
        To* o_ptr = dst + nc * O_HW;
        const int start_h = cur_oh * sh - ph;
        const int bh = max(0, start_h);
        const int eh = min(ih, start_h + kh);
        const int start_w = cur_ow * sw - pw;
        const int bw = max(0, start_w);
        const int ew = min(iw, start_w + kw);
        const To scale = 1. / (kh * kw);
        To res = 0;

        switch (op) {
            case GGML_OP_POOL_AVG: res = 0; break;
            case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
        }

        for (int i = bh; i < eh; i += 1) {
            for (int j = bw; j < ew; j += 1) {
    #if __CUDA_ARCH__ >= 350
                Ti cur = __ldg(i_ptr + i * iw + j);
    #else
                Ti cur = i_ptr[i * iw + j];
    #endif
                switch (op) {
                    case GGML_OP_POOL_AVG: res += cur * scale; break;
                    case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
                }
            }
        }
        o_ptr[cur_oh * ow + cur_ow] = res;
}

template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    GGML_ASSERT(ne00 % 2 == 0);

    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
            src0_dd, src1_dd, dst_dd,
            ne00, /*ne01, ne02, ne03,*/
            /*ne10, ne11,*/ ne12, /*ne13,*/
            /* s0,*/ s1, s2, s3,
            /* nb00,*/ nb01, nb02, nb03,
            s10, s11, s12/*, s13*/);

    (void) dst;
}

template<typename src0_t>
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
            src0_dd, src1_dd, dst_dd,
            ne00, /*ne01, ne02, ne03,*/
            /*ne10, ne11,*/ ne12, /*ne13,*/
            /* s0,*/ s1, s2, s3,
            /* nb00,*/ nb01, nb02, nb03,
            s10, s11, s12/*, s13*/);

    (void) dst;
}

template<float (*bin_op)(const float, const float)>
struct bin_bcast_cuda {
    template<typename src0_t, typename src1_t, typename dst_t>
    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
            cudaStream_t stream) {

        GGML_TENSOR_BINARY_OP_LOCALS

        int nr0 = ne10/ne0;
        int nr1 = ne11/ne1;
        int nr2 = ne12/ne2;
        int nr3 = ne13/ne3;

        int nr[4] = { nr0, nr1, nr2, nr3 };

        // collapse dimensions until first broadcast dimension
        int64_t cne0[] = {ne0, ne1, ne2, ne3};
        int64_t cne1[] = {ne10, ne11, ne12, ne13};
        size_t cnb0[] = {nb0, nb1, nb2, nb3};
        size_t cnb1[] = {nb10, nb11, nb12, nb13};
        auto collapse = [](int64_t cne[]) {
            cne[0] *= cne[1];
            cne[1] = cne[2];
            cne[2] = cne[3];
            cne[3] = 1;
        };

        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
            cnb[1] *= cne[1];
            cnb[2] *= cne[2];
            cnb[3] *= cne[3];
        };

        for (int i = 0; i < 4; i++) {
            if (nr[i] != 1) {
                break;
            }
            if (i > 0) {
                collapse_nb(cnb0, cne0);
                collapse_nb(cnb1, cne1);
                collapse(cne0);
                collapse(cne1);
            }
        }
        {
            int64_t ne0 = cne0[0];
            int64_t ne1 = cne0[1];
            int64_t ne2 = cne0[2];
            int64_t ne3 = cne0[3];

            int64_t ne10 = cne1[0];
            int64_t ne11 = cne1[1];
            int64_t ne12 = cne1[2];
            int64_t ne13 = cne1[3];

            size_t nb0 = cnb0[0];
            size_t nb1 = cnb0[1];
            size_t nb2 = cnb0[2];
            size_t nb3 = cnb0[3];

            size_t nb10 = cnb1[0];
            size_t nb11 = cnb1[1];
            size_t nb12 = cnb1[2];
            size_t nb13 = cnb1[3];

            size_t s0 = nb0 / sizeof(dst_t);
            size_t s1 = nb1 / sizeof(dst_t);
            size_t s2 = nb2 / sizeof(dst_t);
            size_t s3 = nb3 / sizeof(dst_t);

            size_t s10 = nb10 / sizeof(src1_t);
            size_t s11 = nb11 / sizeof(src1_t);
            size_t s12 = nb12 / sizeof(src1_t);
            size_t s13 = nb13 / sizeof(src1_t);

            GGML_ASSERT(s0 == 1);
            GGML_ASSERT(s10 == 1);

            const int block_size = 128;

            int64_t hne0 = std::max(ne0/2LL, 1LL);

            dim3 block_dims;
            block_dims.x = std::min<unsigned int>(hne0, block_size);
            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);

            dim3 block_nums(
                (hne0 + block_dims.x - 1) / block_dims.x,
                (ne1 + block_dims.y - 1) / block_dims.y,
                (ne2*ne3 + block_dims.z - 1) / block_dims.z
            );

            if (block_nums.z > 65535) {
                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s10, */ s11, s12, s13);
            } else {
                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s10, */ s11, s12, s13);
            }
        }
    }
};

static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
}

static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
}

static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    if (ncols < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    }
}

static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
    static const float eps = 1e-6f;
    if (group_size < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    }
}

static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2);
    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
}

static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
                             const int scale_factor, cudaStream_t stream) {
    int ne0 = (ne00 * scale_factor);
    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
}

static void pad_f32_cuda(const float * x, float * dst,
    const int ne00, const int ne01, const int ne02, const int ne03,
    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2*ne3);
    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
}

static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
}

static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
                                        const int dim, const int max_period, cudaStream_t stream) {
    int half_ceil = (dim + 1) / 2;
    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne00, 1);
    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
}

static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    if (ncols < 1024) {
        const dim3 block_dims(WARP_SIZE, 1, 1);
        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
    }
}

static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky, 1);
    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
        const bool need_check = false;
        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
    } else {
        const bool need_check = true;
        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
    }
}

template<typename dst_t>
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
#if QK_K == 256
    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
#else
    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template<typename dst_t>
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
#if QK_K == 64
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
#else
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
    int id;
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
        case GGML_TYPE_Q5_1:
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            CUDA_CHECK(cudaGetDevice(&id));
            if (g_device_caps[id].cc >= CC_PASCAL) {
                return dequantize_block_q8_0_f16_cuda;
            }
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
        case GGML_TYPE_Q2_K:
            return dequantize_row_q2_K_cuda;
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_cuda;
        case GGML_TYPE_Q4_K:
            return dequantize_row_q4_K_cuda;
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_cuda;
        case GGML_TYPE_Q6_K:
            return dequantize_row_q6_K_cuda;
        case GGML_TYPE_IQ2_XXS:
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ2_S:
            return dequantize_row_iq2_s_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
        case GGML_TYPE_IQ4_XS:
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cuda<float>;
        default:
            return nullptr;
    }
}

static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q5_0:
            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
        case GGML_TYPE_Q5_1:
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
        case GGML_TYPE_Q2_K:
            return dequantize_row_q2_K_cuda;
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_cuda;
        case GGML_TYPE_Q4_K:
            return dequantize_row_q4_K_cuda;
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_cuda;
        case GGML_TYPE_Q6_K:
            return dequantize_row_q6_K_cuda;
        case GGML_TYPE_IQ2_XXS:
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ2_S:
            return dequantize_row_iq2_s_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
        case GGML_TYPE_IQ4_XS:
            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half>;
        default:
            return nullptr;
    }
}

static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 1, 1);
    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
}

static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<1, 1, convert_f16>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}

template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
static void mul_mat_vec_q_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    GGML_ASSERT(ncols_x % qk == 0);
    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);

    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    int64_t nwarps = 1;
    int64_t rows_per_cuda_block = 1;

    if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
        switch(ncols_y) {
            case 1:
                nwarps = 4;
                rows_per_cuda_block = 1;
                break;
            case 2:
            case 3:
            case 4:
                nwarps = 4;
                rows_per_cuda_block = 2;
                break;
            case 5:
            case 6:
            case 7:
            case 8:
                nwarps = 2;
                rows_per_cuda_block = 2;
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
    }
    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
    const dim3 block_nums(nblocks, 1, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    switch (ncols_y) {
        case 1:
            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 2:
            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 3:
            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 4:
            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 5:
            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 6:
            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 7:
            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 8:
            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }
}

static void ggml_mul_mat_q4_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q4_0_RDNA2;
        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
        nwarps = NWARPS_Q4_0_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q4_0_RDNA1;
        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
        nwarps = NWARPS_Q4_0_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q4_0_AMPERE;
        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
        nwarps = NWARPS_Q4_0_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q4_0_PASCAL;
        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
        nwarps = NWARPS_Q4_0_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q4_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q4_1_RDNA2;
        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
        nwarps = NWARPS_Q4_1_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q4_1_RDNA1;
        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
        nwarps = NWARPS_Q4_1_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q4_1_AMPERE;
        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
        nwarps = NWARPS_Q4_1_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q4_1_PASCAL;
        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
        nwarps = NWARPS_Q4_1_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q5_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q5_0_RDNA2;
        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
        nwarps = NWARPS_Q5_0_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q5_0_RDNA1;
        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
        nwarps = NWARPS_Q5_0_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q5_0_AMPERE;
        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
        nwarps = NWARPS_Q5_0_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q5_0_PASCAL;
        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
        nwarps = NWARPS_Q5_0_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q5_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q5_1_RDNA2;
        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
        nwarps = NWARPS_Q5_1_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q5_1_RDNA1;
        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
        nwarps = NWARPS_Q5_1_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q5_1_AMPERE;
        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
        nwarps = NWARPS_Q5_1_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q5_1_PASCAL;
        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
        nwarps = NWARPS_Q5_1_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q8_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q8_0_RDNA2;
        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
        nwarps = NWARPS_Q8_0_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q8_0_RDNA1;
        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
        nwarps = NWARPS_Q8_0_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q8_0_AMPERE;
        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
        nwarps = NWARPS_Q8_0_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q8_0_PASCAL;
        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
        nwarps = NWARPS_Q8_0_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q2_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q2_K_RDNA2;
        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
        nwarps = NWARPS_Q2_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q2_K_RDNA1;
        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
        nwarps = NWARPS_Q2_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q2_K_AMPERE;
        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
        nwarps = NWARPS_Q2_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q2_K_PASCAL;
        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
        nwarps = NWARPS_Q2_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q3_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

#if QK_K == 256

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q3_K_RDNA2;
        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
        nwarps = NWARPS_Q3_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q3_K_RDNA1;
        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
        nwarps = NWARPS_Q3_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q3_K_AMPERE;
        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
        nwarps = NWARPS_Q3_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q3_K_PASCAL;
        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
        nwarps = NWARPS_Q3_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
#endif
}

static void ggml_mul_mat_q4_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q4_K_RDNA2;
        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
        nwarps = NWARPS_Q4_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q4_K_RDNA1;
        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
        nwarps = NWARPS_Q4_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q4_K_AMPERE;
        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
        nwarps = NWARPS_Q4_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q4_K_PASCAL;
        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
        nwarps = NWARPS_Q4_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q5_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q5_K_RDNA2;
        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
        nwarps = NWARPS_Q5_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q5_K_RDNA1;
        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
        nwarps = NWARPS_Q5_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q5_K_AMPERE;
        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
        nwarps = NWARPS_Q5_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q5_K_PASCAL;
        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
        nwarps = NWARPS_Q5_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_q6_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q6_K_RDNA2;
        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
        nwarps = NWARPS_Q6_K_RDNA2;
    } else if (compute_capability >= CC_OFFSET_AMD) {
        mmq_x  =  MMQ_X_Q6_K_RDNA1;
        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
        nwarps = NWARPS_Q6_K_RDNA1;
    } else if (compute_capability >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q6_K_AMPERE;
        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
        nwarps = NWARPS_Q6_K_AMPERE;
    } else if (compute_capability >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q6_K_PASCAL;
        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
        nwarps = NWARPS_Q6_K_PASCAL;
    } else {
        GGML_ASSERT(false);
    }

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);

    if (nrows_x % mmq_y == 0) {
        const bool need_check = false;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}

static void ggml_mul_mat_p021_f16_f32_cuda(
    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {

    const dim3 block_nums(1, nrows_x, nchannels_y);
    const dim3 block_dims(WARP_SIZE, 1, 1);
    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
}

static void ggml_mul_mat_vec_nc_f16_f32_cuda(
    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {

    const dim3 block_nums(1, nrows_x, nchannels_y);
    const dim3 block_dims(WARP_SIZE, 1, 1);
    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
}


static void ggml_cpy_f16_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f16_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}


static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
}

static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
}

template<typename T>
static void rope_cuda(
    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
    if (pos == nullptr) {
        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    } else {
        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    }
}

template<typename T>
static void rope_neox_cuda(
    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);

    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const float inv_ndims = -1.0f / n_dims;

    if (pos == nullptr) {
        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, inv_ndims
        );
    } else {
        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, inv_ndims
        );
    }
}

static void rope_glm_f32_cuda(
    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, int n_ctx, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 4 == 0);
    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
    const dim3 block_nums(num_blocks_x, nrows, 1);
    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
}

static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
                           const int k_rows, const int n_heads_log2_floor, const float m0,
                           const float m1, cudaStream_t stream) {
    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
    const dim3 block_nums(num_blocks_x, nrows, 1);
    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
}

static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}

static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be power of 2
    GGML_ASSERT((ncols & (ncols - 1)) == 0);

    const dim3 block_dims(ncols, 1, 1);
    const dim3 block_nums(1, nrows, 1);
    if (order == GGML_SORT_ORDER_ASC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    } else {
        GGML_ASSERT(false);
    }
}

static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
    const dim3 block_nums(nrows_x, block_num_x, 1);
    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}

static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
    int nth = WARP_SIZE;
    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
    const dim3 block_dims(nth,     1, 1);
    const dim3 block_nums(nrows_x, 1, 1);
    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

    const uint32_t n_head_kv   = nrows_x/nrows_y;
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    if (shmem < g_device_caps[g_main_device].smpb) {
        switch (ncols_x) {
            case 32:
                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 64:
                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 128:
                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 256:
                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 512:
                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 1024:
                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 2048:
                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 4096:
                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            default:
                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
        }
    } else {
        const size_t shmem_low = WARP_SIZE*sizeof(float);
        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
    }
}

template <typename T>
static void im2col_cuda(const float* x, T* dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
    int64_t batch, int64_t batch_offset, int64_t offset_delta,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
    const int parallel_elements = OW * KW * KH;
    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
    dim3 block_nums(num_blocks, OH, batch * IC);
    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
}

// buffer pool for cuda
#define MAX_CUDA_BUFFERS 256

struct scoped_spin_lock {
    std::atomic_flag& lock;
    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
        while (lock.test_and_set(std::memory_order_acquire)) {
            ; // spin
        }
    }
    ~scoped_spin_lock() {
        lock.clear(std::memory_order_release);
    }
    scoped_spin_lock(const scoped_spin_lock&) = delete;
    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
};

static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;

// #define DEBUG_CUDA_MALLOC
struct ggml_cuda_buffer {
    void * ptr = nullptr;
    size_t size = 0;
};

static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};

static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
#ifdef DEBUG_CUDA_MALLOC
    int nnz = 0;
    size_t max_size = 0;
#endif
    size_t best_diff = 1ull << 36;
    int ibest = -1;
    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
        if (b.ptr != nullptr) {
#ifdef DEBUG_CUDA_MALLOC
            ++nnz;
            if (b.size > max_size) max_size = b.size;
#endif
            if (b.size >= size) {
                size_t diff = b.size - size;
                if (diff < best_diff) {
                    best_diff = diff;
                    ibest = i;
                    if (!best_diff) {
                        void * ptr = b.ptr;
                        *actual_size = b.size;
                        b.ptr = nullptr;
                        b.size = 0;
                        return ptr;
                    }
                }
            }
        }
    }
    if (ibest >= 0) {
        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
        void * ptr = b.ptr;
        *actual_size = b.size;
        b.ptr = nullptr;
        b.size = 0;
        return ptr;
    }
    void * ptr;
    size_t look_ahead_size = (size_t) (1.05 * size);
    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
    ggml_cuda_set_device(device);
    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
    *actual_size = look_ahead_size;
    g_cuda_pool_size[device] += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
#endif
    return ptr;
}

static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
        if (b.ptr == nullptr) {
            b.ptr = ptr;
            b.size = size;
            return;
        }
    }
    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
    ggml_cuda_set_device(device);
    CUDA_CHECK(cudaFree(ptr));
    g_cuda_pool_size[device] -= size;
}

#if !defined(GGML_USE_HIPBLAS)
// pool with virtual memory
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
    const size_t alignment = 128;
    size = alignment * ((size + alignment - 1) / alignment);

    size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];

    if (size > avail) {
        // round up to the next multiple of the granularity
        size_t reserve_size = size - avail;
        const size_t granularity = g_device_caps[device].vmm_granularity;
        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);

        GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);

        // allocate more physical memory
        CUmemAllocationProp prop = {};
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = device;
        CUmemGenericAllocationHandle handle;
        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));

        // reserve virtual address space (if not already reserved)
        if (g_cuda_pool_addr[device] == 0) {
            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
        }

        // map at the end of the pool
        CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));

        // the memory allocation handle is no longer needed after mapping
        CU_CHECK(cuMemRelease(handle));

        // set access
        CUmemAccessDesc access = {};
        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access.location.id = device;
        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));

        // add to the pool
        g_cuda_pool_size[device] += reserve_size;

        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
        //       (unsigned long long) (reserve_size/1024/1024));
    }

    GGML_ASSERT(g_cuda_pool_addr[device] != 0);

    void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
    *actual_size = size;
    g_cuda_pool_used[device] += size;

#ifdef DEBUG_CUDA_MALLOC
    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
#endif

    return ptr;
}

static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

#ifdef DEBUG_CUDA_MALLOC
    printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
#endif

    g_cuda_pool_used[device] -= size;

    // all deallocations must be in reverse order of the allocations
    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
}

static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
    if (g_device_caps[device].vmm) {
        return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
    } else {
        return ggml_cuda_pool_malloc_leg(device, size, actual_size);
    }
}

static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
    if (g_device_caps[device].vmm) {
        ggml_cuda_pool_free_vmm(device, ptr, size);
    } else {
        ggml_cuda_pool_free_leg(device, ptr, size);
    }
}
#else
#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
#endif // !defined(GGML_USE_HIPBLAS)

template<typename T>
struct cuda_pool_alloc {
    int device = -1;
    T * ptr = nullptr;
    size_t actual_size = 0;

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(ptr == nullptr);
        CUDA_CHECK(cudaGetDevice(&device));
        ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
        return ptr;
    }

    cuda_pool_alloc(size_t size) {
        alloc(size);
    }

    ~cuda_pool_alloc() {
        if (ptr != nullptr) {
            ggml_cuda_pool_free(device, ptr, actual_size);
        }
    }

    T * get() {
        return ptr;
    }

    cuda_pool_alloc() = default;
    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
};

static bool g_cublas_loaded = false;

GGML_CALL bool ggml_cublas_loaded(void) {
    return g_cublas_loaded;
}

GGML_CALL void ggml_init_cublas() {
    static bool initialized = false;

    if (!initialized) {

#ifdef __HIP_PLATFORM_AMD__
        // Workaround for a rocBLAS bug when using multiple graphics cards:
        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
        rocblas_initialize();
        CUDA_CHECK(cudaDeviceSynchronize());
#endif

        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
            initialized = true;
            g_cublas_loaded = false;
            fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
            return;
        }

        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
        int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ)
        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
#else
        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
#endif
#if defined(CUDA_USE_TENSOR_CORES)
        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
#else
        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
#endif
        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
        for (int id = 0; id < g_device_count; ++id) {
            int device_vmm = 0;

#if !defined(GGML_USE_HIPBLAS)
            CUdevice device;
            CU_CHECK(cuDeviceGet(&device, id));
            CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));

            if (device_vmm) {
                CUmemAllocationProp alloc_prop = {};
                alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
                alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
                alloc_prop.location.id = id;
                CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
            }
#endif // !defined(GGML_USE_HIPBLAS)
            g_device_caps[id].vmm = !!device_vmm;

            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
            fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

            g_default_tensor_split[id] = total_vram;
            total_vram += prop.totalGlobalMem;

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else
            g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
            g_device_caps[id].smpb = prop.sharedMemPerBlock;
        }
        for (int id = 0; id < g_device_count; ++id) {
            g_default_tensor_split[id] /= total_vram;
        }

        for (int id = 0; id < g_device_count; ++id) {
            ggml_cuda_set_device(id);

            // create cuda streams
            for (int is = 0; is < MAX_STREAMS; ++is) {
                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
            }

            // create cublas handle
            CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
        }

        // configure logging to stdout
        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

        initialized = true;
        g_cublas_loaded = true;
    }
}

GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
        return nullptr;
    }

    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
        // clear the error
        cudaGetLastError();
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
            size/1024.0/1024.0, cudaGetErrorString(err));
        return nullptr;
    }

    return ptr;
}

GGML_CALL void ggml_cuda_host_free(void * ptr) {
    CUDA_CHECK(cudaFreeHost(ptr));
}

static cudaError_t ggml_cuda_cpy_tensor_2d(
    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {

    cudaMemcpyKind kind;
    char * src_ptr;
    if (src->backend == GGML_BACKEND_TYPE_CPU) {
        kind = cudaMemcpyHostToDevice;
        src_ptr = (char *) src->data;
    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
        kind = cudaMemcpyDeviceToDevice;
        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
        int id;
        CUDA_CHECK(cudaGetDevice(&id));
        src_ptr = (char *) extra->data_device[id];
    } else {
        GGML_ASSERT(false);
    }
    char * dst_ptr = (char *) dst;

    const int64_t ne0 = src->ne[0];
    const int64_t nb0 = src->nb[0];
    const int64_t nb1 = src->nb[1];
    const int64_t nb2 = src->nb[2];
    const int64_t nb3 = src->nb[3];
    const enum ggml_type type = src->type;
    const int64_t ts = ggml_type_size(type);
    const int64_t bs = ggml_blck_size(type);
    int64_t i1_diff = i1_high - i1_low;

    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
    if (nb0 == ts && nb1 == ts*ne0/bs) {
        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
    } else if (nb0 == ts) {
        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
    } else {
        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
            const void * rx = (const void *) ((const char *) x + i1*nb1);
            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
            // pretend the row is a matrix with cols=1
            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
            if (r != cudaSuccess) return r;
        }
        return cudaSuccess;
    }
}

static void ggml_cuda_op_get_rows(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {

    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));

    const int32_t * src1_i32 = (const int32_t *) src1_d;

    switch (src0->type) {
        case GGML_TYPE_F16:
            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_F32:
            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_0:
            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_1:
            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_0:
            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_1:
            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q8_0:
            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        default:
            // TODO: k-quants
            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
            GGML_ASSERT(false);
            break;
    }
}

template<class op>
static void ggml_cuda_op_bin_bcast(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
    } else {
        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
        GGML_ASSERT(false);
    }
}

static void ggml_cuda_op_repeat(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);

    (void) src1;
    (void) src1_d;
}

static void ggml_cuda_op_add(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}

static void ggml_cuda_op_acc(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
    int offset = dst->op_params[3] / 4; // offset in bytes

    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);

    (void) dst;
}

static void ggml_cuda_op_mul(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}

static void ggml_cuda_op_div(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
}

static void ggml_cuda_op_gelu(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_silu(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_gelu_quick(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_tanh(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_relu(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_hardsigmoid(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_hardswish(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_leaky_relu(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float negative_slope;
    memcpy(&negative_slope, dst->op_params, sizeof(float));

    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_sqr(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_norm(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_group_norm(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    int num_groups = dst->op_params[0];
    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_concat(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
    }

    (void) src1;
    (void) dst;
}

static void ggml_cuda_op_upscale(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    const int scale_factor = dst->op_params[0];

    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_pad(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    pad_f32_cuda(src0_dd, dst_dd,
        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_arange(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    float start;
    float stop;
    float step;
    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));

    int64_t steps = (int64_t)ceil((stop - start) / step);
    GGML_ASSERT(ggml_nelements(dst) == steps);

    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);

    (void) src0;
    (void) src1;
    (void) src0_dd;
    (void) src1_dd;
}

static void ggml_cuda_op_timestep_embedding(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int dim = dst->op_params[0];
    const int max_period = dst->op_params[1];

    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_rms_norm(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_mul_mat_q(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;

    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_1:
            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_0:
            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    (void) src1;
    (void) dst;
    (void) src1_ddf_i;
}

static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
    int64_t min_compute_capability = INT_MAX;
    int64_t max_compute_capability = INT_MIN;
    for (int id = 0; id < g_device_count; ++id) {
        if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
            if (min_compute_capability > g_device_caps[id].cc) {
                min_compute_capability = g_device_caps[id].cc;
            }
            if (max_compute_capability < g_device_caps[id].cc) {
                max_compute_capability = g_device_caps[id].cc;
            }
        }
    }

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    switch(type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
        case GGML_TYPE_F16:
        case GGML_TYPE_F32:
            return 1;
        case GGML_TYPE_Q2_K:
            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
        case GGML_TYPE_Q3_K:
            return min_compute_capability < CC_RDNA2 ? 128 : 64;
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
        default:
            GGML_ASSERT(false);
    }
#else
    switch(type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
            return 64;
        case GGML_TYPE_F16:
        case GGML_TYPE_F32:
            return 1;
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
        case GGML_TYPE_Q6_K:
            return 64;
        default:
            GGML_ASSERT(false);
    }
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}

static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
    const int64_t nrows = ggml_nrows(tensor);
    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);

    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
    *row_low -= *row_low % rounding;

    if (id == g_device_count - 1) {
        *row_high = nrows;
    } else {
        *row_high = nrows*tensor_split[id + 1];
        *row_high -= *row_high % rounding;
    }
}

static void ggml_cuda_op_mul_mat_vec_q(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    (void) src1;
    (void) dst;
    (void) src1_ddf_i;
    (void) src1_ncols;
    (void) src1_padded_row_size;
}

static void ggml_cuda_op_dequantize_mul_mat_vec(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
#ifdef GGML_CUDA_F16
    cuda_pool_alloc<half> src1_dfloat_a;
    half * src1_dfloat = nullptr; // dfloat == half

    bool src1_convert_f16 =
        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

    if (src1_convert_f16) {
        src1_dfloat = src1_dfloat_a.alloc(ne00);
        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
        GGML_ASSERT(to_fp16_cuda != nullptr);
        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
    }
#else
    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
#endif // GGML_CUDA_F16

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_1:
            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_0:
            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_1:
            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q8_0:
            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q2_K:
            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q3_K:
            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_K:
            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_K:
            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q6_K:
            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_F16:
            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    (void) src1;
    (void) dst;
    (void) src1_ddq_i;
    (void) src1_ncols;
    (void) src1_padded_row_size;
}

static void ggml_cuda_op_mul_mat_cublas(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    GGML_ASSERT(src0_dd_i  != nullptr);
    GGML_ASSERT(src1_ddf_i != nullptr);
    GGML_ASSERT(dst_dd_i   != nullptr);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;

    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    // the main device has a larger memory buffer to hold the results from all GPUs
    // ldc == nrows of the matrix that cuBLAS writes into
    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

    const int compute_capability = g_device_caps[id].cc;

    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
        //printf("this branch\n");
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
        cuda_pool_alloc<half> src0_as_f16;
        if (src0->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = row_diff*ne00;
            src0_as_f16.alloc(ne);
            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
        }
        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();

        cuda_pool_alloc<half> src1_as_f16;
        if (src1->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = src1_ncols*ne10;
            src1_as_f16.alloc(ne);
            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
        }
        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
        cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);

        const half alpha_f16 = 1.0f;
        const half beta_f16 = 0.0f;

        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
        CUBLAS_CHECK(
            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                    row_diff, src1_ncols, ne10,
                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
                                src1_ptr,       CUDA_R_16F, ne10,
                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
                    CUBLAS_COMPUTE_16F,
                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));

        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
    } else {
        cuda_pool_alloc<float> src0_ddq_as_f32;
        cuda_pool_alloc<float> src1_ddq_as_f32;

        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
            src0_ddq_as_f32.alloc(row_diff*ne00);
            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
        }
        if (src1->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
            src1_ddq_as_f32.alloc(src1_ncols*ne10);
            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
        }

        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

        const float alpha = 1.0f;
        const float beta = 0.0f;

        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
        CUBLAS_CHECK(
            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                    row_diff, src1_ncols, ne10,
                    &alpha, src0_ddf_i,  ne00,
                            src1_ddf1_i, ne10,
                    &beta,  dst_dd_i,    ldc));
    }

    (void) dst;
    (void) src1_ddq_i;
    (void) src1_padded_row_size;
}

static void ggml_cuda_op_rope(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne2 = dst->ne[2];
    const int64_t nrows = ggml_nrows(src0);

    //const int n_past      = ((int32_t *) dst->op_params)[0];
    const int n_dims      = ((int32_t *) dst->op_params)[1];
    const int mode        = ((int32_t *) dst->op_params)[2];
    const int n_ctx       = ((int32_t *) dst->op_params)[3];
    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];

    // RoPE alteration for extended context
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));

    const int32_t * pos = nullptr;
    if ((mode & 1) == 0) {
        GGML_ASSERT(src1->type == GGML_TYPE_I32);
        GGML_ASSERT(src1->ne[0] == ne2);
        pos = (const int32_t *) src1_dd;
    }

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

    rope_corr_dims corr_dims;
    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);

    // compute
    if (is_glm) {
        GGML_ASSERT(false);
        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
    } else if (is_neox) {
        if (src0->type == GGML_TYPE_F32) {
            rope_neox_cuda(
                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else if (src0->type == GGML_TYPE_F16) {
            rope_neox_cuda(
                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else {
            GGML_ASSERT(false);
        }
    } else {
        if (src0->type == GGML_TYPE_F32) {
            rope_cuda(
                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else if (src0->type == GGML_TYPE_F16) {
            rope_cuda(
                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
            );
        } else {
            GGML_ASSERT(false);
        }
    }

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_alibi(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t nrows = ggml_nrows(src0);

    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_head = ((int32_t *) dst->op_params)[1];
    float max_bias;
    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

    //GGML_ASSERT(ne01 + n_past == ne00);
    GGML_ASSERT(n_head == ne02);

    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);

    (void) src1;
    (void) src1_dd;
}

static void ggml_cuda_op_pool2d(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int32_t * opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int k1 = opts[2];
    const int s0 = opts[3];
    const int s1 = opts[4];
    const int p0 = opts[5];
    const int p1 = opts[6];

    const int64_t IH = src0->ne[1];
    const int64_t IW = src0->ne[0];

    const int64_t N = dst->ne[3];
    const int64_t OC = dst->ne[2];
    const int64_t OH = dst->ne[1];
    const int64_t OW = dst->ne[0];

    const int parallel_elements = N * OC * OH * OW;
    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
    dim3 block_nums(num_blocks);
    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);

    (void) src1;
    (void) src1_dd;
}

static void ggml_cuda_op_im2col(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

    const int64_t IC = src1->ne[is_2D ? 2 : 1];
    const int64_t IH = is_2D ? src1->ne[1] : 1;
    const int64_t IW =         src1->ne[0];

    const int64_t KH = is_2D ? src0->ne[1] : 1;
    const int64_t KW =         src0->ne[0];

    const int64_t OH = is_2D ? dst->ne[2] : 1;
    const int64_t OW =         dst->ne[1];

    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
    const int64_t batch = src1->ne[3];
    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32

    if(dst->type == GGML_TYPE_F16) {
        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
    } else {
        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
    }

    (void) src0;
    (void) src0_dd;
}

static void ggml_cuda_op_sum_rows(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_argsort(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_diag_mask_inf(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int nrows0 = ggml_nrows(src0);

    const int n_past = ((int32_t *) dst->op_params)[0];

    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_soft_max(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional

    const int64_t ne00    = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);
    const int64_t nrows_y = src0->ne[1];

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

    // positions tensor
    float * src2_dd = nullptr;
    cuda_pool_alloc<float> src2_f;

    ggml_tensor * src2 = dst->src[2];
    const bool use_src2 = src2 != nullptr;

    if (use_src2) {
        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;

        if (src2_on_device) {
            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
            src2_dd = (float *) src2_extra->data_device[g_main_device];
        } else {
            src2_dd = src2_f.alloc(ggml_nelements(src2));
            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
        }
    }

    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
}

static void ggml_cuda_op_scale(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float scale;
    memcpy(&scale, dst->op_params, sizeof(float));

    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_clamp(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float min;
    float max;
    memcpy(&min, dst->op_params, sizeof(float));
    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());

    (void) src1;
    (void) dst;
    (void) src1_dd;
}

static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
    const int64_t nrows0 = ggml_nrows(src0);

    const bool use_src1 = src1 != nullptr;
    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;

    const bool src0_on_device =             src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
    const bool  dst_on_device =              dst->backend == GGML_BACKEND_TYPE_GPU;

    // dd = data device
    float * src0_ddf = nullptr;
    float * src1_ddf = nullptr;
    float *  dst_ddf = nullptr;

    cuda_pool_alloc<float> src0_f;
    cuda_pool_alloc<float> src1_f;
    cuda_pool_alloc<float>  dst_f;

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    if (src0_on_device) {
        src0_ddf = (float *) src0_extra->data_device[g_main_device];
    } else {
        src0_ddf = src0_f.alloc(ggml_nelements(src0));
        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
    }

    if (use_src1) {
        if (src1_on_device) {
            src1_ddf = (float *) src1_extra->data_device[g_main_device];
        } else {
            src1_ddf = src1_f.alloc(ggml_nelements(src1));
            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
        }
    }
    if (dst_on_device) {
        dst_ddf = (float *) dst_extra->data_device[g_main_device];
    } else {
        dst_ddf = dst_f.alloc(ggml_nelements(dst));
    }

    // do the computation
    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
    CUDA_CHECK(cudaGetLastError());

    // copy dst to host if necessary
    if (!dst_on_device) {
        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
    }

    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
        CUDA_CHECK(cudaDeviceSynchronize());
    }
}

static void ggml_cuda_set_peer_access(const int n_tokens) {
    static bool peer_access_enabled = false;

    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;

    if (peer_access_enabled == enable_peer_access) {
        return;
    }

#ifdef NDEBUG
    for (int id = 0; id < g_device_count; ++id) {
        ggml_cuda_set_device(id);
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    for (int id = 0; id < g_device_count; ++id) {
        ggml_cuda_set_device(id);

        for (int id_other = 0; id_other < g_device_count; ++id_other) {
            if (id == id_other) {
                continue;
            }
            if (id != g_main_device && id_other != g_main_device) {
                continue;
            }

            int can_access_peer;
            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
            if (can_access_peer) {
                if (enable_peer_access) {
                    cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
                    if (err != cudaErrorPeerAccessAlreadyEnabled) {
                        CUDA_CHECK(err);
                    }
                } else {
                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
                    if (err != cudaErrorPeerAccessNotEnabled) {
                        CUDA_CHECK(err);
                    }
                }
            }
        }
    }
#endif // NDEBUG

    peer_access_enabled = enable_peer_access;
}

// FIXME: move this somewhere else
struct ggml_backend_cuda_split_buffer_type_context {
    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
};

static void ggml_cuda_op_mul_mat(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
    const bool convert_src1_to_q8_1) {

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
    const int64_t nrows1 = ggml_nrows(src1);

    GGML_ASSERT(ne03 == ne13);

    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

    const int nb2 = dst->nb[2];
    const int nb3 = dst->nb[3];

    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

    const int64_t i02_divisor = ne12 / ne02;

    const size_t src0_ts = ggml_type_size(src0->type);
    const size_t src0_bs = ggml_blck_size(src0->type);
    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;

    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
    const bool src0_is_contiguous = ggml_is_contiguous(src0);
    const bool src1_is_contiguous = ggml_is_contiguous(src1);

    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
    GGML_ASSERT(!(split && ne02 > 1));
    GGML_ASSERT(!(split && ne03 > 1));
    GGML_ASSERT(!(split && ne02 < ne12));

    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
    if (split) {
        // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
        // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        tensor_split = buft_ctx->tensor_split;
    }

    struct dev_data {
        cuda_pool_alloc<char>  src0_dd_alloc;
        cuda_pool_alloc<float> src1_ddf_alloc;
        cuda_pool_alloc<char>  src1_ddq_alloc;
        cuda_pool_alloc<float>   dst_dd_alloc;

        char  *  src0_dd = nullptr;
        float * src1_ddf = nullptr; // float
        char  * src1_ddq = nullptr; // q8_1
        float *   dst_dd = nullptr;

        int64_t  row_low;
        int64_t row_high;
    };

    dev_data dev[GGML_CUDA_MAX_DEVICES];

    int used_devices = 0;

    for (int id = 0; id < g_device_count; ++id) {
        // by default, use all rows
        dev[id].row_low  = 0;
        dev[id].row_high = ne01;

        // for multi GPU, get the row boundaries from tensor split
        // and round to mul_mat_q tile sizes
        if (split) {
            const int64_t rounding = get_row_rounding(src0->type, tensor_split);

            if (id != 0) {
                dev[id].row_low  = ne01*tensor_split[id];
                if (dev[id].row_low < ne01) {
                    dev[id].row_low -= dev[id].row_low % rounding;
                }
            }

            if (id != g_device_count - 1) {
                dev[id].row_high  = ne01*tensor_split[id + 1];
                if (dev[id].row_high < ne01) {
                    dev[id].row_high -= dev[id].row_high % rounding;
                }
            }
        }
    }

    for (int id = 0; id < g_device_count; ++id) {
        if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
            continue;
        }

        used_devices++;

        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
        const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;

        ggml_cuda_set_device(id);
        cudaStream_t stream = g_cudaStreams[id][0];

        if (src0_on_device && src0_is_contiguous) {
            dev[id].src0_dd = (char *) src0_extra->data_device[id];
        } else {
            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
        }

        if (src1_on_device && src1_is_contiguous) {
            dev[id].src1_ddf = (float *) src1_extra->data_device[id];
        } else {
            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
        }

        if (convert_src1_to_q8_1) {
            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

            if (src1_on_device && src1_is_contiguous) {
                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
                CUDA_CHECK(cudaGetLastError());
            }
        }

        if (dst_on_device) {
            dev[id].dst_dd = (float *) dst_extra->data_device[id];
        } else {
            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
        }
    }

    // if multiple devices are used they need to wait for the main device
    // here an event is recorded that signals that the main device has finished calculating the input data
    if (split && used_devices > 1) {
        ggml_cuda_set_device(g_main_device);
        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
    }

    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

        for (int id = 0; id < g_device_count; ++id) {
            if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
                continue;
            }

            const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
            const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
            const int64_t row_diff = dev[id].row_high - dev[id].row_low;

            ggml_cuda_set_device(id);
            cudaStream_t stream = g_cudaStreams[id][is];

            // wait for main GPU data if necessary
            if (split && (id != g_main_device || is != 0)) {
                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
            }

            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
                const int64_t i03 = i0 / ne12;
                const int64_t i02 = i0 % ne12;

                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;

                // for split tensors the data begins at i0 == i0_offset_low
                char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
                float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
                char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
                float *   dst_dd_i =   dev[id].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);

                // the main device memory buffer can be on VRAM scratch, with space for all partial results
                // in that case an offset on dst_ddf_i is needed
                if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
                    dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
                }

                // copy src0, src1 to device if necessary
                if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
                    if (id != g_main_device) {
                        if (convert_src1_to_q8_1) {
                            char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
                        } else {
                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
                                                            src1_ncols*ne10*sizeof(float), stream));
                        }
                    }
                } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
                } else {
                    GGML_ASSERT(false);
                }

                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                    CUDA_CHECK(cudaGetLastError());
                }

                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
                }

                // do the computation
                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
                    dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
                CUDA_CHECK(cudaGetLastError());

                // copy dst to host or other device if necessary
                if (!dst_on_device) {
                    void * dst_off_device;
                    cudaMemcpyKind kind;
                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                        dst_off_device = dst->data;
                        kind = cudaMemcpyDeviceToHost;
                    } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
                        dst_off_device = dst_extra->data_device[g_main_device];
                        kind = cudaMemcpyDeviceToDevice;
                    } else {
                        GGML_ASSERT(false);
                    }
                    if (split) {
                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                        // dst is NOT transposed.
                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                        dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
#if !defined(GGML_USE_HIPBLAS)
                        if (kind == cudaMemcpyDeviceToDevice) {
                            // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
                            cudaMemcpy3DPeerParms p = {};
                            p.dstDevice = g_main_device;
                            p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
                            p.srcDevice = id;
                            p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
                            p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
                            CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
                        } else
#endif
                        {
                            CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
                                                            dst_dd_i, row_diff*sizeof(float),
                                                            row_diff*sizeof(float), src1_ncols,
                                                            kind, stream));
                        }
                    } else {
                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                        dhf_dst_i += src1_col_0*ne0;
                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
                    }
                }

                // add event for the main device to wait on until other device is done
                if (split && (id != g_main_device || is != 0)) {
                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
                }
            }
        }
    }

    // main device waits for all other devices to be finished
    if (split && g_device_count > 1) {
        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;

        ggml_cuda_set_device(g_main_device);
        for (int id = 0; id < g_device_count; ++id) {
            if (dev[id].row_low == dev[id].row_high) {
                continue;
            }
            for (int64_t is = 0; is < is_max; ++is) {
                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
            }
        }
    }

    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
        ggml_cuda_set_device(g_main_device);
        CUDA_CHECK(cudaDeviceSynchronize());
    }
}

static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
}

static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
}

static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
}

static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
}

static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
}

static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
}

static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
}

static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
}

static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
}

static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
}

static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
}

static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
}

static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
}
static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
}

static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
}

static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
}

static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
}

static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
}

static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
}

static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
}

static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;

    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

    // dd = data device
    float * src0_ddf = nullptr;
    float * src1_ddf = nullptr;
    float *  dst_ddf = nullptr;

    cuda_pool_alloc<float>  dst_f;

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    if (dst_on_device) {
        dst_ddf = (float *) dst_extra->data_device[g_main_device];
    } else {
        dst_ddf = dst_f.alloc(ggml_nelements(dst));
    }

    // do the computation
    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
    CUDA_CHECK(cudaGetLastError());

    // copy dst to host if necessary
    if (!dst_on_device) {
        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
    }

    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
        CUDA_CHECK(cudaDeviceSynchronize());
    }
}

static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
}

static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
}

GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    if (!g_cublas_loaded) return false;

    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

    // TODO: find the optimal values for these
    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
            src1->type == GGML_TYPE_F32 &&
             dst->type == GGML_TYPE_F32 &&
            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
}

static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    const int64_t ne12 = src1->ne[2];

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
}

static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];

    const int64_t ne12 = src1->ne[2];

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    const int64_t row_stride_x = nb01 / sizeof(half);
    const int64_t channel_stride_x = nb02 / sizeof(half);

    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}

static __global__ void k_compute_batched_ptrs(
        const half * src0_as_f16, const half * src1_as_f16, char * dst,
        const void ** ptrs_src, void ** ptrs_dst,
        int64_t ne12, int64_t ne13,
        int64_t ne23,
        size_t  nb02, size_t  nb03,
        size_t  nb12, size_t  nb13,
        size_t  nbd2, size_t  nbd3,
        int64_t r2,   int64_t r3) {
    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;

    if (i13 >= ne13 || i12 >= ne12) {
        return;
    }

    int64_t i03 = i13 / r3;
    int64_t i02 = i12 / r2;

    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
}

static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));

    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);

    GGML_TENSOR_BINARY_OP_LOCALS

    const int64_t ne_dst = ggml_nelements(dst);

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];
    half * src0_f16 = (half *) src0_ddq;

    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    // convert src1 to fp16
    cuda_pool_alloc<half> src1_f16_alloc;
    if (src1->type != GGML_TYPE_F16) {
        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
        const int64_t ne_src1 = ggml_nelements(src1);
        src1_f16_alloc.alloc(ne_src1);
        GGML_ASSERT(to_fp16_cuda != nullptr);
        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
    }
    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();

    cuda_pool_alloc<half> dst_f16;
    char * dst_t;

    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
    cudaDataType_t      cu_data_type    = CUDA_R_16F;

    // dst strides
    size_t nbd2 = dst->nb[2];
    size_t nbd3 = dst->nb[3];

    const half  alpha_f16 = 1.0f;
    const half  beta_f16  = 0.0f;

    const float alpha_f32 = 1.0f;
    const float beta_f32  = 0.0f;

    const void * alpha = &alpha_f16;
    const void * beta  = &beta_f16;

    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
        dst_t = (char *) dst_f16.alloc(ne_dst);

        nbd2 /= sizeof(float) / sizeof(half);
        nbd3 /= sizeof(float) / sizeof(half);
    } else {
        dst_t = (char *) dst_ddf;

        cu_compute_type = CUBLAS_COMPUTE_32F;
        cu_data_type    = CUDA_R_32F;

        alpha = &alpha_f32;
        beta  = &beta_f32;
    }

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);

    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;

#if 0
    // use cublasGemmEx
    {
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                int i03 = i13 / r3;
                int i02 = i12 / r2;

                CUBLAS_CHECK(
                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                            ne01, ne11, ne10,
                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
                            cu_compute_type,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
            }
        }
    }
#else
    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        // use cublasGemmStridedBatchedEx
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00,  // strideA
                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10,  // strideB
                beta,  (      char *)    dst_t, cu_data_type, ne01,       nb2/nb0,   // strideC
                ne12*ne13,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
        // use cublasGemmBatchedEx
        const int ne23 = ne12*ne13;

        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);

        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
                src0_f16, src1_f16, dst_t,
                ptrs_src.get(), ptrs_dst.get(),
                ne12, ne13,
                ne23,
                nb02, nb03,
                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
                nbd2, nbd3,
                r2, r3);
        CUDA_CHECK(cudaGetLastError());

        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                ne23,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    }
#endif

    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
    }
}

static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool all_on_device =
        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
        (src1->backend == GGML_BACKEND_TYPE_GPU) &&
        ( dst->backend == GGML_BACKEND_TYPE_GPU);

    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;

    int64_t min_compute_capability = INT_MAX;

    bool any_pascal_with_slow_fp16 = false;
    if (split) {
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        auto & tensor_split = buft_ctx->tensor_split;
        for (int id = 0; id < g_device_count; ++id) {
            // skip devices that are not going to do any work:
            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
                continue;
            }

            if (min_compute_capability > g_device_caps[id].cc) {
                min_compute_capability = g_device_caps[id].cc;
            }
            if (g_device_caps[id].cc == 610) {
                any_pascal_with_slow_fp16 = true;
            }
        }
    } else {
        min_compute_capability    = g_device_caps[g_main_device].cc;
        any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
    }

    // check data types and tensor shapes for custom matrix multiplication kernels:
    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;

    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

    bool              use_mul_mat_q =  ggml_cuda_supports_mmq(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;

#ifdef CUDA_USE_TENSOR_CORES
    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES

#else

    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;

    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;

#ifdef CUDA_USE_TENSOR_CORES
    // when tensor cores are available, use them for large batch size
    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
    use_mul_mat_q     = use_mul_mat_q     && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

    // if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
    } else if (use_mul_mat_vec_q) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
    } else if (use_mul_mat_q) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
    } else {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    }
}

#if 0
template<typename ... Srcs>
static __global__ void k_compute_batched_ptrs_id(
        const void ** ptrs_src, void ** ptrs_dst,
        int ne12, int ne13,
        int ne23,
        int nb02, int nb03,
        int nb12, int nb13,
        int nb2, int nb3,
        int r2, int r3,
        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
        const half * src1_f16, half * dst_f16,
        const int32_t * ids, const int id,
        Srcs... src0s) {

    int i = ids[id];

    half * src0_f16;
    const void * srcs_ar[] = { (const half *) src0s... };
    if (src0_type == GGML_TYPE_F16) {
        src0_f16 = (half *) srcs_ar[i];
    } else {
        src0_f16 = src0_as_f16;
        if (threadIdx.x == 0 && threadIdx.y == 0) {
            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
        }
    }

    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
    int i12 = blockIdx.y * blockDim.y + threadIdx.y;

    if (i13 >= ne13 || i12 >= ne12) {
        return;
    }

    int i03 = i13 / r3;
    int i02 = i12 / r2;

    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
}

static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
    const struct ggml_tensor * ids = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];
    const struct ggml_tensor * src00 = dst->src[2];

    const int id = dst->op_params[0];

    GGML_ASSERT(!ggml_is_transposed(src00));
    GGML_ASSERT(!ggml_is_transposed(src1));

    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
    const int64_t ne01 = src00->ne[1];
    const int64_t ne02 = src00->ne[2];
    const int64_t ne03 = src00->ne[3];

    //const int64_t nb01 = src00->nb[1];
    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];

    //const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);

    const int64_t ne1 = ggml_nelements(src1);
    const int64_t ne  = ggml_nelements(dst);

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));

    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    //void * src0_ddq = src0_extra->data_device[g_main_device];
    //half * src0_as_f16 = (half *) src0_ddq;

    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    // convert src1 to fp16
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
    GGML_ASSERT(to_fp16_cuda != nullptr);

    size_t src1_as = 0;
    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

    size_t dst_as = 0;
    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);

    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;

    const half alpha_f16 = 1.0f;
    const half beta_f16  = 0.0f;

    // use cublasGemmBatchedEx
    const int ne23 = ne12*ne13;

    const void ** ptrs_src = nullptr;
          void ** ptrs_dst = nullptr;

    size_t ptrs_src_s = 0;
    size_t ptrs_dst_s = 0;

    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);

    int64_t src0_ne = ggml_nelements(src00);
    half * src0_as_f16 = nullptr;
    size_t src0_as = 0;
    if (src00->type != GGML_TYPE_F16) {
        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
    }

    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
    dim3 block_dims(ne13, ne12);
    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
            ptrs_src, ptrs_dst,
            ne12, ne13,
            ne23,
            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
            nb12, nb13,
            dst->nb[2], dst->nb[3],
            r2, r3,
            src00->type, src0_as_f16, src0_ne,
            src1_as_f16, dst_f16,
            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
    );
    CUDA_CHECK(cudaGetLastError());

    CUBLAS_CHECK(
    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
            ne01, ne11, ne10,
            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
            ne23,
            CUBLAS_COMPUTE_16F,
            CUBLAS_GEMM_DEFAULT_TENSOR_OP));

    if (src0_as != 0) {
        ggml_cuda_pool_free(src0_as_f16, src0_as);
    }
    if (ptrs_src_s != 0) {
        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
    }
    if (ptrs_dst_s != 0) {
        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
    }

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);

    ggml_cuda_pool_free(src1_as_f16, src1_as);
    ggml_cuda_pool_free(dst_f16, dst_as);
}
#endif

static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#if 0
    ggml_cuda_mul_mat_id_cublas(dst);
    // TODO: mmq/mmv support
#endif

    const size_t nb11 = src1->nb[1];
    const size_t nb1  =  dst->nb[1];

    const struct ggml_tensor * ids = src0;
    const int32_t id = ((int32_t *) dst->op_params)[0];
    const int32_t n_as = ((int32_t *) dst->op_params)[1];

    std::vector<char> ids_host(ggml_nbytes(ids));

    cudaStream_t stream = g_cudaStreams[g_main_device][0];

    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else {
        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
    }

    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;

    ggml_tensor_extra_gpu src1_row_extra;
    ggml_tensor_extra_gpu dst_row_extra;

    ggml_tensor src1_row = *src1;
    ggml_tensor dst_row = *dst;

    src1_row.backend = GGML_BACKEND_TYPE_GPU;
    dst_row.backend  = GGML_BACKEND_TYPE_GPU;

    src1_row.extra = &src1_row_extra;
    dst_row.extra = &dst_row_extra;

    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
    char * dst_original  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];

    if (src1->ne[1] == 1) {
        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
        GGML_ASSERT(dst->backend  == GGML_BACKEND_TYPE_GPU);

        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
            //int32_t row_id;
            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));

            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

            GGML_ASSERT(row_id >= 0 && row_id < n_as);

            const struct ggml_tensor * src0_row = dst->src[row_id + 2];

            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?

            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?

            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
        }
    } else {
        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));

        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();

        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;

        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
            const struct ggml_tensor * src0_row = dst->src[row_id + 2];

            int64_t num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

                if (row_id_i != row_id) {
                    continue;
                }

                GGML_ASSERT(row_id >= 0 && row_id < n_as);

                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
                                        nb11, src1_kind, stream));
                num_src1_rows++;
            }

            if (num_src1_rows == 0) {
                continue;
            }

            src1_row.ne[1] = num_src1_rows;
            dst_row.ne[1] = num_src1_rows;

            src1_row.nb[1] = nb11;
            src1_row.nb[2] = num_src1_rows*nb11;
            src1_row.nb[3] = num_src1_rows*nb11;

            dst_row.nb[1] = nb1;
            dst_row.nb[2] = num_src1_rows*nb1;
            dst_row.nb[3] = num_src1_rows*nb1;

            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);

            num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

                if (row_id_i != row_id) {
                    continue;
                }

                GGML_ASSERT(row_id >= 0 && row_id < n_as);

                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
                                        nb1, dst_kind, stream));
                num_src1_rows++;
            }
        }
    }

    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
}

static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
}

static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);

    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    //GGML_ASSERT(src0->ne[3] == 1);

    const int64_t nb00 = src0->nb[0];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];
    const int64_t nb03 = src0->nb[3];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];

    //GGML_ASSERT(src1->ne[3] == 1);

    const int64_t nb10 = src1->nb[0];
    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2];
    const int64_t nb13 = src1->nb[3];

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;

    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];

    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else {
        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
        GGML_ASSERT(false);
    }

    (void) dst;
}

static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    // TODO: why do we pass dst as src1 here?
    ggml_cuda_cpy(src0, dst, nullptr);
    (void) src1;
}

static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
}

static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
}

static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
}

static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
}

static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
}

static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
}

static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
}

static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
}

static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    (void) src0;
    (void) src1;
    (void) dst;
}

static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
}

GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
    if (main_device >= g_device_count) {
        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                main_device, g_device_count, g_main_device);
        return;
    }

    if (g_main_device != main_device && g_device_count > 1) {
        g_main_device = main_device;
        //cudaDeviceProp prop;
        //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
        //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
    }
}

GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    if (!g_cublas_loaded) return false;

    ggml_cuda_func_t func;
    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
        return false;
    }

    if (tensor->op == GGML_OP_MUL_MAT) {
        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
#ifndef NDEBUG
            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
#endif
            return false;
        }
    }

    switch (tensor->op) {
        case GGML_OP_REPEAT:
            func = ggml_cuda_repeat;
            break;
        case GGML_OP_GET_ROWS:
            func = ggml_cuda_get_rows;
            break;
        case GGML_OP_DUP:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_ADD:
            func = ggml_cuda_add;
            break;
        case GGML_OP_ACC:
            func = ggml_cuda_acc;
            break;
        case GGML_OP_MUL:
            func = ggml_cuda_mul;
            break;
        case GGML_OP_DIV:
            func = ggml_cuda_div;
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
                    func = ggml_cuda_gelu;
                    break;
                case GGML_UNARY_OP_SILU:
                    func = ggml_cuda_silu;
                    break;
                case GGML_UNARY_OP_GELU_QUICK:
                    func = ggml_cuda_gelu_quick;
                    break;
                case GGML_UNARY_OP_TANH:
                    func = ggml_cuda_tanh;
                    break;
                case GGML_UNARY_OP_RELU:
                    func = ggml_cuda_relu;
                    break;
                case GGML_UNARY_OP_HARDSIGMOID:
                    func = ggml_cuda_hardsigmoid;
                    break;
                case GGML_UNARY_OP_HARDSWISH:
                    func = ggml_cuda_hardswish;
                    break;
                default:
                    return false;
            }
            break;
        case GGML_OP_NORM:
            func = ggml_cuda_norm;
            break;
        case GGML_OP_GROUP_NORM:
            func = ggml_cuda_group_norm;
            break;
        case GGML_OP_CONCAT:
            func = ggml_cuda_concat;
            break;
        case GGML_OP_UPSCALE:
            func = ggml_cuda_upscale;
            break;
        case GGML_OP_PAD:
            func = ggml_cuda_pad;
            break;
        case GGML_OP_ARANGE:
            func = ggml_cuda_arange;
            break;
        case GGML_OP_TIMESTEP_EMBEDDING:
            func = ggml_cuda_timestep_embedding;
            break;
        case GGML_OP_LEAKY_RELU:
            func = ggml_cuda_leaky_relu;
            break;
        case GGML_OP_RMS_NORM:
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                return false;
            }
            func = ggml_cuda_mul_mat;
            break;
        case GGML_OP_MUL_MAT_ID:
            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
                return false;
            }
            func = ggml_cuda_mul_mat_id;
            break;
        case GGML_OP_SCALE:
            func = ggml_cuda_scale;
            break;
        case GGML_OP_SQR:
            func = ggml_cuda_sqr;
            break;
        case GGML_OP_CLAMP:
            func = ggml_cuda_clamp;
            break;
        case GGML_OP_CPY:
            func = ggml_cuda_cpy;
            break;
        case GGML_OP_CONT:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            func = ggml_cuda_nop;
            break;
        case GGML_OP_DIAG_MASK_INF:
            func = ggml_cuda_diag_mask_inf;
            break;
        case GGML_OP_SOFT_MAX:
            func = ggml_cuda_soft_max;
            break;
        case GGML_OP_ROPE:
            func = ggml_cuda_rope;
            break;
        case GGML_OP_ALIBI:
            func = ggml_cuda_alibi;
            break;
        case GGML_OP_IM2COL:
            func = ggml_cuda_im2col;
            break;
        case GGML_OP_POOL_2D:
            func = ggml_cuda_pool2d;
            break;
        case GGML_OP_SUM_ROWS:
            func = ggml_cuda_sum_rows;
            break;
        case GGML_OP_ARGSORT:
            func = ggml_cuda_argsort;
            break;
        default:
            return false;
    }

    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
    }

    if (params->ith != 0) {
        return true;
    }
    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
        return true;
    }
    func(tensor->src[0], tensor->src[1], tensor);
    return true;
}

GGML_CALL int ggml_cuda_get_device_count() {
    int device_count;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        return 0;
    }
    return device_count;
}

GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
    snprintf(description, description_size, "%s", prop.name);
}

////////////////////////////////////////////////////////////////////////////////

// backend interface

#define UNUSED GGML_UNUSED

struct ggml_backend_cuda_context {
    int device;
    std::string name;
};

// cuda buffer

struct ggml_backend_cuda_buffer_context {
    int device;
    void * dev_ptr = nullptr;
    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
    size_t temp_tensor_extra_index = 0;
    std::string name;

    ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
        device(device), dev_ptr(dev_ptr),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_buffer_context() {
        delete[] temp_tensor_extras;
    }

    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
        // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
        if (temp_tensor_extras == nullptr) {
            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
        }

        size_t alloc_index = temp_tensor_extra_index;
        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
        memset(extra, 0, sizeof(*extra));

        return extra;
    }
};

GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    return ctx->name.c_str();
}

GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
}

GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    CUDA_CHECK(cudaFree(ctx->dev_ptr));
    delete ctx;
}

GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
    return ctx->dev_ptr;
}

GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    if (tensor->view_src != NULL && tensor->view_offs == 0) {
        assert(tensor->view_src->buffer->buft == buffer->buft);
        tensor->backend = tensor->view_src->backend;
        tensor->extra = tensor->view_src->extra;
        return;
    }

    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();

    extra->data_device[ctx->device] = tensor->data;

    tensor->backend = GGML_BACKEND_TYPE_GPU;
    tensor->extra = extra;

    if (ggml_is_quantized(tensor->type)) {
        // initialize padding to 0 to avoid possible NaN values
        size_t original_size = ggml_nbytes(tensor);
        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

        if (padded_size > original_size && tensor->view_src == nullptr) {
            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
        }
    }
}

GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaDeviceSynchronize());
}

GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaDeviceSynchronize());
}

GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
    if (ggml_backend_buffer_is_cuda(src->buffer)) {
        ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

        ggml_cuda_set_device(src_ctx->device);
        CUDA_CHECK(cudaDeviceSynchronize());
        ggml_cuda_set_device(dst_ctx->device);
        CUDA_CHECK(cudaDeviceSynchronize());
        CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaDeviceSynchronize());

        return true;
    }
    return false;
}

GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
    CUDA_CHECK(cudaDeviceSynchronize());
}

static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
    /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
    /* .clear           = */ ggml_backend_cuda_buffer_clear,
    /* .reset           = */ NULL,
};

// cuda buffer type
struct ggml_backend_cuda_buffer_type_context {
    int device;
    std::string name;
};

GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

    return ctx->name.c_str();
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

    ggml_cuda_set_device(buft_ctx->device);

    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

    void * dev_ptr;
    cudaError_t err = cudaMalloc(&dev_ptr, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
        return nullptr;
    }

    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
}

GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return 128;

    UNUSED(buft);
}

GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];

    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
    }

    return size;

    UNUSED(buft);
}

GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    if (!ggml_backend_is_cuda(backend)) {
        return false;
    }

    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    return buft_ctx->device == cuda_ctx->device;
}

static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
    /* .is_host          = */ NULL,
};

GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
    // FIXME: this is not thread safe
    if (device >= ggml_backend_cuda_get_device_count()) {
        return nullptr;
    }

    static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];

    static bool ggml_backend_cuda_buffer_type_initialized = false;

    if (!ggml_backend_cuda_buffer_type_initialized) {
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
            ggml_backend_cuda_buffer_types[i] = {
                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
                /* .context  = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
            };
        }
        ggml_backend_cuda_buffer_type_initialized = true;
    }

    return &ggml_backend_cuda_buffer_types[device];
}

// cuda split buffer

struct ggml_backend_cuda_split_buffer_context {
    ~ggml_backend_cuda_split_buffer_context() {
        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
            for (int id = 0; id < g_device_count; ++id) {
                for (int64_t is = 0; is < MAX_STREAMS; ++is) {
                    if (extra->events[id][is] != nullptr) {
                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
                    }
                }
                if (extra->data_device[id] != nullptr) {
                    CUDA_CHECK(cudaFree(extra->data_device[id]));
                }
            }
            delete extra;
        }
    }

    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
};

GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
    return GGML_CUDA_NAME "_Split";

    UNUSED(buffer);
}

static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
    UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
}

GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
    delete ctx;
}

GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
    return (void *)0x1000;

    UNUSED(buffer);
}

GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported

    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    const int64_t ne0 = tensor->ne[0];

    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};

    ctx->tensor_extras.push_back(extra);

    for (int id = 0; id < g_device_count; ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }

        // FIXME: do not crash if cudaMalloc fails
        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
        ggml_cuda_set_device(id);
        char * buf;
        CUDA_CHECK(cudaMalloc(&buf, size));

        // set padding to 0 to avoid possible NaN values
        if (size > original_size) {
            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
        }

        extra->data_device[id] = buf;

        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
        }
    }
    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
    tensor->extra = extra;
}

GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    const int64_t ne0 = tensor->ne[0];
    const size_t nb1 = tensor->nb[1];
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

    for (int id = 0; id < g_device_count; ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        const size_t offset_split = row_low*nb1;
        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }

        const char * buf_host = (const char *)data + offset_split;
        CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
    }
}

GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    const int64_t ne0 = tensor->ne[0];
    const size_t nb1 = tensor->nb[1];
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

    for (int id = 0; id < g_device_count; ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        const size_t offset_split = row_low*nb1;
        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }

        char * buf_host = (char *)data + offset_split;
        CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
    }
}

GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    UNUSED(buffer);
    UNUSED(value);
}

static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
    /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
    /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
    /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
    /* .cpy_tensor      = */ NULL,
    /* .clear           = */ ggml_backend_cuda_split_buffer_clear,
    /* .reset           = */ NULL,
};

// cuda split buffer type

GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
    return GGML_CUDA_NAME "_Split";

    UNUSED(buft);
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
    // instead, we allocate them for each tensor separately in init_tensor
    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();

    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
}

GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return 128;

    UNUSED(buft);
}

GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;

    size_t total_size = 0;

    const int64_t ne0 = tensor->ne[0];

    for (int id = 0; id < g_device_count; ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        total_size += ggml_nbytes_split(tensor, nrows_split);

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
    }

    return total_size;
}

GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return ggml_backend_is_cuda(backend);

    UNUSED(buft);
}

GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return false;

    UNUSED(buft);
}

static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
};

GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
    // FIXME: this is not thread safe
    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};

    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
    if (all_zero) {
        tensor_split_arr = g_default_tensor_split;
    } else {
        float split_sum = 0.0f;
        for (int i = 0; i < g_device_count; ++i) {
            tensor_split_arr[i] = split_sum;
            split_sum += tensor_split[i];
        }
        for (int i = 0; i < g_device_count; ++i) {
            tensor_split_arr[i] /= split_sum;
        }
    }

    auto it = buft_map.find(tensor_split_arr);
    if (it != buft_map.end()) {
        return &it->second;
    }

    struct ggml_backend_buffer_type buft {
        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
    };

    auto result = buft_map.emplace(tensor_split_arr, buft);
    return &result.first->second;
}

// host buffer type

GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
    return GGML_CUDA_NAME "_Host";

    UNUSED(buft);
}

GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
    return GGML_CUDA_NAME "_Host";

    UNUSED(buffer);
}

GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_cuda_host_free(buffer->context);
}

GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * ptr = ggml_cuda_host_malloc(size);

    if (ptr == nullptr) {
        // fallback to cpu buffer
        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
    }

    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    buffer->buft = buft;
    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

    return buffer;
}

GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
        },
        /* .context  = */ nullptr,
    };

    return &ggml_backend_cuda_buffer_type_host;
}

// backend

GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    return cuda_ctx->name.c_str();
}

GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    delete cuda_ctx;
    delete backend;
}

GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
}

GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
}

GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
}

GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
        return true;
    }

    return false;
}

GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));

    UNUSED(backend);
}

GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    ggml_cuda_set_main_device(cuda_ctx->device);

    ggml_compute_params params = {};
    params.type = GGML_TASK_TYPE_COMPUTE;
    params.ith = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }

#ifndef NDEBUG
        assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
        assert(node->extra != nullptr);

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j] != nullptr) {
                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                assert(node->src[j]->extra != nullptr);
            }
        }
#endif

        bool ok = ggml_cuda_compute_forward(&params, node);
        if (!ok) {
            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }
        GGML_ASSERT(ok);
    }

    return GGML_STATUS_SUCCESS;
}

GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                    return true;
                default:
                    return false;
            }
            break;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            {
                struct ggml_tensor * a;
                struct ggml_tensor * b;
                if (op->op == GGML_OP_MUL_MAT) {
                    a = op->src[0];
                    b = op->src[1];
                } else {
                    a = op->src[2];
                    b = op->src[1];
                }
                if (a->ne[3] != b->ne[3]) {
                    return false;
                }
                ggml_type a_type = a->type;
                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
                    a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                        return false;
                    }
                }
                return true;
            } break;
        case GGML_OP_GET_ROWS:
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F16:
                    case GGML_TYPE_F32:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                        return true;
                    default:
                        return false;
                }
            } break;
        case GGML_OP_CPY:
            {
                ggml_type src0_type = op->src[0]->type;
                ggml_type src1_type = op->src[1]->type;
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
                return false;
            } break;
        case GGML_OP_DUP:
        case GGML_OP_REPEAT:
        case GGML_OP_CONCAT:
            {
                ggml_type src0_type = op->src[0]->type;
                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
            } break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ROPE:
        case GGML_OP_ALIBI:
        case GGML_OP_IM2COL:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
        case GGML_OP_GROUP_NORM:
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
            return true;
        default:
            return false;
    }

    UNUSED(backend);
}

static ggml_backend_i ggml_backend_cuda_interface = {
    /* .get_name                = */ ggml_backend_cuda_name,
    /* .free                    = */ ggml_backend_cuda_free,
    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
    /* .synchronize             = */ ggml_backend_cuda_synchronize,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
    /* .supports_op             = */ ggml_backend_cuda_supports_op,
};

static ggml_guid_t ggml_backend_cuda_guid() {
    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
    return &guid;
}

GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
    ggml_init_cublas(); // TODO: remove from ggml.c

    if (device < 0 || device >= ggml_cuda_get_device_count()) {
        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
        return nullptr;
    }

    // not strictly necessary, but it may reduce the overhead of the first graph_compute
    ggml_cuda_set_main_device(device);

    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
        /* .device = */ device,
        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
    };

    ggml_backend_t cuda_backend = new ggml_backend {
        /* .guid      = */ ggml_backend_cuda_guid(),
        /* .interface = */ ggml_backend_cuda_interface,
        /* .context   = */ ctx
    };

    return cuda_backend;
}

GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
}

GGML_CALL int ggml_backend_cuda_get_device_count() {
    return ggml_cuda_get_device_count();
}

GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
    ggml_cuda_get_device_description(device, description, description_size);
}

GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
    ggml_cuda_set_device(device);

    CUDA_CHECK(cudaMemGetInfo(free, total));
}

// backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
    return cuda_backend;

    UNUSED(params);
}

extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();

GGML_CALL int ggml_backend_cuda_reg_devices() {
    int device_count = ggml_cuda_get_device_count();
    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
    for (int i = 0; i < device_count; i++) {
        char name[128];
        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
    }
    return device_count;
}
-												ggml : always define ggml_fp16_t as uint16_t (#5666)

* ggml : always define ggml_fp16_t as uint16_t

ggml-ci

* ggml : cont

ggml-ci

* ggml : cont

* ggml : cont

ggml-ci

* ggml : cont

ggml-ci

* cuda : no longer ggml headers last

ggml-ci

* ggml : fix q6_K FP16 -> FP32 conversion

ggml-ci

* ggml : more FP16 -> FP32 conversion fixes

ggml-ci
											
										
										
											2024-02-22 22:21:39 +01:00
+								#include "ggml-cuda.h"
 								#include "ggml.h"
 								#include "ggml-backend-impl.h"
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 13:27:20 +01:00
+								#if defined(GGML_USE_HIPBLAS)
 								#define GGML_COMMON_DECL_HIP
 								#define GGML_COMMON_IMPL_HIP
 								#else
 								#define GGML_COMMON_DECL_CUDA
-												ggml : add ggml-common.h to deduplicate shared code (#5940)

* ggml : add ggml-common.h to shared code

ggml-ci

* scripts : update sync scripts

* sycl : reuse quantum tables

ggml-ci

* ggml : minor

* ggml : minor

* sycl : try to fix build
											
										
										
											2024-03-09 11:47:57 +01:00
+								#define GGML_COMMON_IMPL_CUDA
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 13:27:20 +01:00
+								#endif
-												ggml : add ggml-common.h to deduplicate shared code (#5940)

* ggml : add ggml-common.h to shared code

ggml-ci

* scripts : update sync scripts

* sycl : reuse quantum tables

ggml-ci

* ggml : minor

* ggml : minor

* sycl : try to fix build
											
										
										
											2024-03-09 11:47:57 +01:00
+								#include "ggml-common.h"
-												llama.cpp : split llama_context_params into model and context params (#3301)

* llama.cpp : split llama_context_params into model and context params

ggml-ci

* fix metal build

* fix freq_base/scale default to model value

* llama-bench : keep the same model between tests when possible

* move n_threads to llama_context_params, add n_threads_batch

* fix mpi build

* remove kv_size(), cuda scratch fixes

* remove low-vram option

* add n_threads_batch to system info, refactor to get_system_info()

* add documentation about --threads-batch to the READMEs

* llama-bench fix

* main : fix rope freq/scale warning

* llama.cpp : add llama_get_model
common : add llama_tokenize from model

* remove duplicated ctx/model functions

ggml-ci

* cuda : print total VRAM used
											
										
										
											2023-09-28 21:42:38 +02:00
+								#include <algorithm>
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								#include <assert.h>
 								#include <atomic>
 								#include <cinttypes>
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								#include <cstddef>
 								#include <cstdint>
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								#include <float.h>
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								#include <limits>
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 03:14:14 +02:00
+								#include <stdint.h>
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								#include <stdio.h>
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								#include <string>
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								#include <vector>
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								#include <map>
 								#include <array>
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 03:14:14 +02:00
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
 								#define STRINGIZE_IMPL(...) #__VA_ARGS__
 								#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#if defined(GGML_USE_HIPBLAS)
 								#include <hip/hip_runtime.h>
 								#include <hipblas/hipblas.h>
 								#include <hip/hip_fp16.h>
 								#ifdef __HIP_PLATFORM_AMD__
 								// for rocblas_initialize()
 								#include "rocblas/rocblas.h"
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#endif // __HIP_PLATFORM_AMD__
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 								#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 								#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define CUBLAS_OP_N HIPBLAS_OP_N
 								#define CUBLAS_OP_T HIPBLAS_OP_T
 								#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 								#define CUBLAS_TF32_TENSOR_OP_MATH 0
 								#define CUDA_R_16F  HIPBLAS_R_16F
 								#define CUDA_R_32F  HIPBLAS_R_32F
 								#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-												ggml-cuda: Fix HIP build (#4528)

regression of #4490
Adds defines for two new datatypes
cublasComputeType_t, cudaDataType_t.

Currently using deprecated hipblasDatatype_t since newer ones very recent.
											
										
										
											2023-12-18 22:33:45 +01:00
+								#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cublasCreate hipblasCreate
 								#define cublasGemmEx hipblasGemmEx
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								#define cublasGemmBatchedEx hipblasGemmBatchedEx
 								#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cublasHandle_t hipblasHandle_t
 								#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 								#define cublasSetStream hipblasSetStream
 								#define cublasSgemm hipblasSgemm
 								#define cublasStatus_t hipblasStatus_t
-												ggml-cuda: Fix HIP build (#4528)

regression of #4490
Adds defines for two new datatypes
cublasComputeType_t, cudaDataType_t.

Currently using deprecated hipblasDatatype_t since newer ones very recent.
											
										
										
											2023-12-18 22:33:45 +01:00
+								#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 16:37:53 +02:00
+								#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 								#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 								#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaDeviceProp hipDeviceProp_t
 								#define cudaDeviceSynchronize hipDeviceSynchronize
 								#define cudaError_t hipError_t
-												cuda : ignore peer access already enabled errors (#5597)

* cuda : ignore peer access already enabled errors

* fix hip
											
										
										
											2024-02-19 23:40:26 +01:00
+								#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 								#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaEventCreateWithFlags hipEventCreateWithFlags
 								#define cudaEventDisableTiming hipEventDisableTiming
 								#define cudaEventRecord hipEventRecord
 								#define cudaEvent_t hipEvent_t
 								#define cudaEventDestroy hipEventDestroy
 								#define cudaFree hipFree
 								#define cudaFreeHost hipHostFree
 								#define cudaGetDevice hipGetDevice
 								#define cudaGetDeviceCount hipGetDeviceCount
 								#define cudaGetDeviceProperties hipGetDeviceProperties
 								#define cudaGetErrorString hipGetErrorString
 								#define cudaGetLastError hipGetLastError
-												cuda : ROCm AMD Unified Memory Architecture (UMA) handling (#4449)

* AMD ROCm: handle UMA memory VRAM expansions

This resolves #2797 by allowing ROCm AMD GPU users with a UMA to
dynamically expand the VRAM allocated to the GPU.

Without this, AMD ROCm users with shared CPU/GPU memory usually are
stuck with the BIOS-set (or fixed) framebuffer VRAM, making it
impossible to load more than 1-2 layers.

Note that the model is duplicated in RAM because it's loaded once for
the CPU and then copied into a second set of allocations that are
managed by the HIP UMA system. We can fix this later.

* clarify build process for ROCm on linux with cmake

* avoid using deprecated ROCm hipMallocHost

* keep simplifying the change required for UMA

* cmake: enable UMA-compatible allocation when LLAMA_HIP_UMA=ON
											
										
										
											2023-12-21 20:45:32 +01:00
+								#ifdef GGML_HIP_UMA
 								#define cudaMalloc hipMallocManaged
 								#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
 								#else
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaMalloc hipMalloc
 								#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-												cuda : ROCm AMD Unified Memory Architecture (UMA) handling (#4449)

* AMD ROCm: handle UMA memory VRAM expansions

This resolves #2797 by allowing ROCm AMD GPU users with a UMA to
dynamically expand the VRAM allocated to the GPU.

Without this, AMD ROCm users with shared CPU/GPU memory usually are
stuck with the BIOS-set (or fixed) framebuffer VRAM, making it
impossible to load more than 1-2 layers.

Note that the model is duplicated in RAM because it's loaded once for
the CPU and then copied into a second set of allocations that are
managed by the HIP UMA system. We can fix this later.

* clarify build process for ROCm on linux with cmake

* avoid using deprecated ROCm hipMallocHost

* keep simplifying the change required for UMA

* cmake: enable UMA-compatible allocation when LLAMA_HIP_UMA=ON
											
										
										
											2023-12-21 20:45:32 +01:00
+								#endif
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaMemcpy hipMemcpy
 								#define cudaMemcpyAsync hipMemcpyAsync
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
 								#define cudaMemcpy2DAsync hipMemcpy2DAsync
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 								#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 								#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 								#define cudaMemcpyKind hipMemcpyKind
 								#define cudaMemset hipMemset
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								#define cudaMemsetAsync hipMemsetAsync
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								#define cudaMemGetInfo hipMemGetInfo
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 								#define cudaSetDevice hipSetDevice
 								#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								#define cudaStreamFireAndForget hipStreamFireAndForget
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaStreamNonBlocking hipStreamNonBlocking
 								#define cudaStreamSynchronize hipStreamSynchronize
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#define cudaStream_t hipStream_t
 								#define cudaSuccess hipSuccess
-												ggml-cuda: Fix HIP build by adding define for __trap (#4569)

Regression of 139882392258671ffe5acdfcadc0bc08572d6eef
HIP doesn't have trap, only abort
											
										
										
											2023-12-21 20:13:25 +01:00
+								#define __trap abort
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 								#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
 								#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
 								#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
 								#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
 								#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
 								#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
 								#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 								#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#else
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								#include <cuda_runtime.h>
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#include <cuda.h>
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								#include <cublas_v2.h>
 								#include <cuda_fp16.h>
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								#if CUDART_VERSION < 11020
-												Fix new CUDA10 compilation errors (#4635)


											
										
										
											2023-12-26 11:38:36 +01:00
+								#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-												cuda : fix jetson compile error (#4560)

* fix old jetson compile error

* Update Makefile

* update jetson detect and cuda version detect

* update cuda marco define

* update makefile and cuda,fix some issue

* Update README.md

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update Makefile

* Update README.md

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-22 16:11:12 +01:00
+								#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
 								#define CUBLAS_COMPUTE_16F CUDA_R_16F
 								#define CUBLAS_COMPUTE_32F CUDA_R_32F
 								#define cublasComputeType_t cudaDataType_t
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#endif // CUDART_VERSION < 11020
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#endif // defined(GGML_USE_HIPBLAS)
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												CUDA: fix softmax compile for old CUDA versions (#4862)


											
										
										
											2024-01-12 12:30:41 +01:00
+								#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								#define CC_PASCAL     600
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#define CC_VOLTA      700
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define CC_OFFSET_AMD 1000000
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 13:52:01 +01:00
+								#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 13:52:01 +01:00
+								#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
-												ggml-cuda : increase max graph size (#4084)


											
										
										
											2023-11-15 13:58:13 +01:00
+								#define GGML_CUDA_MAX_NODES 8192
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 13:52:01 +01:00
+								// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 								// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 								// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
 								// -  7B quantum model: +100-200 MB
 								// - 13B quantum model: +200-400 MB
 								//
 								//#define GGML_CUDA_FORCE_MMQ
 								// TODO: improve this to be correct for more hardware
 								//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
 								#if !defined(GGML_CUDA_FORCE_MMQ)
 								#define CUDA_USE_TENSOR_CORES
 								#endif
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
 								#define  MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 13:52:01 +01:00
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								#if defined(GGML_USE_HIPBLAS)
 								#define __CUDA_ARCH__ 1300
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
 								    defined(__gfx1150__) || defined(__gfx1151__)
 								#define RDNA3
 								#endif
 								#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
 								    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
 								#define RDNA2
 								#endif
-												cuda : vsubss4 for older versions of ROCm/clang (#2942)


											
										
										
											2023-09-01 23:33:19 +02:00
+								#ifndef __has_builtin
 								    #define __has_builtin(x) 0
 								#endif
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
 								    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
 								    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-												cuda : vsubss4 for older versions of ROCm/clang (#2942)


											
										
										
											2023-09-01 23:33:19 +02:00
+								#if __has_builtin(__builtin_elementwise_sub_sat)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    return reinterpret_cast<const int &>(c);
-												cuda : vsubss4 for older versions of ROCm/clang (#2942)


											
										
										
											2023-09-01 23:33:19 +02:00
+								#else
 								    int8x4_t c;
 								    int16_t tmp;
 								#pragma unroll
 								    for (int i = 0; i < 4; i++) {
 								        tmp = va[i] - vb[i];
 								        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
 								        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
 								        c[i] = tmp;
 								    }
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    return reinterpret_cast<int &>(c);
-												cuda : vsubss4 for older versions of ROCm/clang (#2942)


											
										
										
											2023-09-01 23:33:19 +02:00
+								#endif // __has_builtin(__builtin_elementwise_sub_sat)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								}
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								static __device__ __forceinline__ int __vsub4(const int a, const int b) {
 								    return __vsubss4(a, b);
 								}
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
 								    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
 								    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
 								    unsigned int c;
 								    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 								#pragma unroll
 								    for (int i = 0; i < 4; ++i) {
 								        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
 								    }
 								    return c;
 								}
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 								#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
 								    c = __builtin_amdgcn_sdot4(a, b, c, false);
-												ggml : use __builtin_amdgcn_sudot4 in __dp4a for gfx11 (#4787)


											
										
										
											2024-01-07 07:52:42 +01:00
+								#elif defined(RDNA3)
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
 								#elif defined(__gfx1010__) || defined(__gfx900__)
 								    int tmp1;
 								    int tmp2;
 								    asm("\n \
 								        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
 								        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
 								        v_add3_u32 %0, %1, %2, %0 \n \
 								        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
 								        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
 								        v_add3_u32 %0, %1, %2, %0 \n \
 								        "
 								        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
 								        : "v"(a), "v"(b)
 								    );
 								#else
 								    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
 								    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 								    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
 								#endif
 								    return c;
 								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#endif // defined(GGML_USE_HIPBLAS)
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												ggml : fix warnings under MSVC (#1908)


											
										
										
											2023-06-17 17:46:15 +02:00
+								#if defined(_MSC_VER)
 								#pragma warning(disable: 4244 4267) // possible loss of data
 								#endif
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								[[noreturn]]
 								static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
 								    int id = -1; // in case cudaGetDevice fails
 								    cudaGetDevice(&id);
 								    fprintf(stderr, "CUDA error: %s\n", msg);
 								    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
 								    fprintf(stderr, "  %s\n", stmt);
 								    // abort with GGML_ASSERT to get a stack trace
 								    GGML_ASSERT(!"CUDA error");
 								}
 								#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
 								     do {                                                                           \
 								        auto err_ = (err);                                                          \
 								        if (err_ != (success)) {                                                    \
 								            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
 								        }                                                                           \
 								    } while (0)
 								#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
-												Fixed CUDA runtime version check (#1879)


											
										
										
											2023-06-15 21:49:08 +02:00
+								#if CUDART_VERSION >= 12000
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    static const char * cublas_get_error_str(const cublasStatus_t err) {
 								        return cublasGetStatusString(err);
 								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								#else
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    static const char * cublas_get_error_str(const cublasStatus_t err) {
 								        switch (err) {
 								            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
 								            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
 								            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
 								            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
 								            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
 								            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
 								            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
 								            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
 								            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
 								            default: return "unknown error";
 								        }
 								    }
 								#endif // CUDART_VERSION >= 12000
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								#if !defined(GGML_USE_HIPBLAS)
 								static const char * cu_get_error_str(CUresult err) {
 								    const char * err_str;
 								    cuGetErrorString(err, &err_str);
 								    return err_str;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#endif
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								#if CUDART_VERSION >= 11100
 								#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
 								#else
 								#define GGML_CUDA_ASSUME(x)
 								#endif // CUDART_VERSION >= 11100
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								typedef half dfloat; // dequantize float
 								typedef half2 dfloat2;
 								#else
 								typedef float dfloat; // dequantize float
 								typedef float2 dfloat2;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif //GGML_CUDA_F16
 								static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    int x32 = 0;
 								    x32 |= x16[0] <<  0;
 								    x32 |= x16[1] << 16;
 								    return x32;
 								}
 								static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    int x32 = 0;
 								    x32 |= x16[0] <<  0;
 								    x32 |= x16[1] << 16;
 								    return x32;
 								}
 								static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								template<typename T>
 								using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
 								typedef to_t_cuda_t<float> to_fp32_cuda_t;
 								typedef to_t_cuda_t<half> to_fp16_cuda_t;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								typedef void (*ggml_cuda_op_mul_mat_t)(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const int64_t src1_padded_row_size, cudaStream_t stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								typedef void (*ggml_cuda_op_flatten_t)(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
 								typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 								typedef void (*load_tiles_cuda_t)(
 								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								typedef float (*vec_dot_q_mul_mat_cuda_t)(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								#define WARP_SIZE 32
-												CUDA: Fixed 7b q3_K_S with mul_mat_vec_q (#2313)


											
										
										
											2023-07-22 21:27:34 +02:00
+								#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
+								#define CUDA_GELU_BLOCK_SIZE 256
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								#define CUDA_SILU_BLOCK_SIZE 256
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								#define CUDA_TANH_BLOCK_SIZE 256
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								#define CUDA_RELU_BLOCK_SIZE 256
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
 								#define CUDA_HARDSWISH_BLOCK_SIZE 256
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								#define CUDA_SQR_BLOCK_SIZE 256
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								#define CUDA_CPY_BLOCK_SIZE 32
 								#define CUDA_SCALE_BLOCK_SIZE 256
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								#define CUDA_CLAMP_BLOCK_SIZE 256
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								#define CUDA_ROPE_BLOCK_SIZE 256
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								#define CUDA_ALIBI_BLOCK_SIZE 32
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								#define CUDA_QUANTIZE_BLOCK_SIZE 256
-												cuda : deduplicated dequantization code (#1453)


											
										
										
											2023-05-14 20:53:23 +02:00
+								#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								#define CUDA_GET_ROWS_BLOCK_SIZE 256
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								#define CUDA_UPSCALE_BLOCK_SIZE 256
 								#define CUDA_CONCAT_BLOCK_SIZE 256
 								#define CUDA_PAD_BLOCK_SIZE 256
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								#define CUDA_ARANGE_BLOCK_SIZE 256
 								#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								#define CUDA_ACC_BLOCK_SIZE 256
 								#define CUDA_IM2COL_BLOCK_SIZE 256
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								#define CUDA_POOL2D_BLOCK_SIZE 256
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 20:38:54 +01:00
+								#define CUDA_Q8_0_NE_ALIGN 2048
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								// dmmv = dequantize_mul_mat_vec
 								#ifndef GGML_CUDA_DMMV_X
 								#define GGML_CUDA_DMMV_X 32
 								#endif
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								#ifndef GGML_CUDA_MMV_Y
 								#define GGML_CUDA_MMV_Y 1
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								#endif
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								#ifndef K_QUANTS_PER_ITERATION
 								#define K_QUANTS_PER_ITERATION 2
 								#else
 								static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 								#endif
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 16:37:53 +02:00
+								#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
 								#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
 								#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								#define MUL_MAT_SRC1_COL_STRIDE 128
 								#define MAX_STREAMS 8
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
+								struct ggml_tensor_extra_gpu {
 								    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
+								};
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								// this is faster on Windows
 								// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_set_device(const int device) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    int current_device;
 								    CUDA_CHECK(cudaGetDevice(&current_device));
 								    if (device == current_device) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        return;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    }
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    CUDA_CHECK(cudaSetDevice(device));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								static int g_device_count = -1;
 								static int g_main_device = 0;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								struct cuda_device_capabilities {
 								    int     cc;                 // compute capability
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    size_t  smpb;               // max. shared memory per block
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    bool    vmm;                // virtual memory support
 								    size_t  vmm_granularity;    // granularity of virtual memory
 								};
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-												cuda : replace asserts in wrong architecture checks with __trap (#4556)

* cuda : replace asserts in wrong architecture checks with __trap

* make bad_arch noreturn, remove returns
											
										
										
											2023-12-21 18:02:30 +01:00
+								[[noreturn]]
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								static __device__ void no_device_code(
 								    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
 								           file_name, line, function_name, arch);
 								    (void) arch_list;
 								#else
 								    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
 								           file_name, line, function_name, arch, arch_list);
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												cuda : replace asserts in wrong architecture checks with __trap (#4556)

* cuda : replace asserts in wrong architecture checks with __trap

* make bad_arch noreturn, remove returns
											
										
										
											2023-12-21 18:02:30 +01:00
+								    __trap();
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    (void) no_device_code; // suppress unused function warning
-												cuda : replace asserts in wrong architecture checks with __trap (#4556)

* cuda : replace asserts in wrong architecture checks with __trap

* make bad_arch noreturn, remove returns
											
										
										
											2023-12-21 18:02:30 +01:00
+								}
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								#ifdef __CUDA_ARCH__
 								#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
 								#else
 								#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
 								#endif // __CUDA_ARCH__
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								static __device__ __forceinline__ float warp_reduce_sum(float x) {
 								#pragma unroll
 								    for (int mask = 16; mask > 0; mask >>= 1) {
 								        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
 								    }
 								    return x;
 								}
 								static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 								#pragma unroll
 								    for (int mask = 16; mask > 0; mask >>= 1) {
 								        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
 								        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
 								    }
 								    return a;
 								}
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								#ifdef GGML_CUDA_F16
 								static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 								#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 								#pragma unroll
 								   for (int mask = 16; mask > 0; mask >>= 1) {
 								       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
 								   }
 								   return a;
 								#else
 								   (void) a;
 								   NO_DEVICE_CODE;
 								#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 								}
 								#endif // GGML_CUDA_F16
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								static __device__ __forceinline__ float warp_reduce_max(float x) {
 								#pragma unroll
 								    for (int mask = 16; mask > 0; mask >>= 1) {
 								        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
 								    }
 								    return x;
 								}
-												ci : enable -Werror for CUDA builds (#5579)

* cmake : pass -Werror through -Xcompiler

ggml-ci

* make, cmake : enable CUDA errors on warnings

ggml-ci
											
										
										
											2024-02-19 13:45:41 +01:00
+								//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 								//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 								//#pragma unroll
 								//    for (int mask = 16; mask > 0; mask >>= 1) {
 								//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
 								//    }
 								//    return x;
 								//#else
 								//    (void) x;
 								//    NO_DEVICE_CODE;
 								//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 								//}
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static __device__ __forceinline__ float op_repeat(const float a, const float b) {
 								    return b;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    GGML_UNUSED(a);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static __device__ __forceinline__ float op_add(const float a, const float b) {
 								    return a + b;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static __device__ __forceinline__ float op_mul(const float a, const float b) {
 								    return a * b;
 								}
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 18:35:54 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static __device__ __forceinline__ float op_div(const float a, const float b) {
 								    return a / b;
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 18:35:54 +02:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
 								static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
 								        int ne0, int ne1, int ne2, int ne3,
 								        int ne10, int ne11, int ne12, int ne13,
 								        /*int s0, */ int s1,  int s2,  int s3,
 								        /*int s10,*/ int s11, int s12, int s13) {
 								    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
 								    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
 								    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
 								    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 12:49:04 +01:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 12:49:04 +01:00
+								        return;
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    const int i11 = i1 % ne11;
 								    const int i12 = i2 % ne12;
 								    const int i13 = i3 % ne13;
 								    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
 								    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
 								    const size_t i_dst  = i_src0;
 								    const src0_t * src0_row = src0 + i_src0;
 								    const src1_t * src1_row = src1 + i_src1;
 								    dst_t * dst_row = dst + i_dst;
 								    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
 								        const int i10 = i0 % ne10;
 								        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 								    }
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 12:49:04 +01:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
 								static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
 								        int ne0, int ne1, int ne2, int ne3,
 								        int ne10, int ne11, int ne12, int ne13,
 								        /*int s0, */ int s1,  int s2,  int s3,
 								        /*int s10,*/ int s11, int s12, int s13) {
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
+								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    const int i3 = i/(ne2*ne1*ne0);
 								    const int i2 = (i/(ne1*ne0)) % ne2;
 								    const int i1 = (i/ne0) % ne1;
 								    const int i0 = i % ne0;
 								    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
+								        return;
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    const int i11 = i1 % ne11;
 								    const int i12 = i2 % ne12;
 								    const int i13 = i3 % ne13;
 								    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
 								    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
 								    const size_t i_dst  = i_src0;
 								    const src0_t * src0_row = src0 + i_src0;
 								    const src1_t * src1_row = src1 + i_src1;
 								    dst_t * dst_row = dst + i_dst;
 								    const int i10 = i0 % ne10;
 								    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
+								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
 								    const int ne10, const int ne11, const int ne12,
 								    const int nb1, const int nb2, int offset) {
 								    const int i = blockDim.x * blockIdx.x + threadIdx.x;
 								    if (i >= ne) {
 								        return;
 								    }
 								    int src1_idx = i - offset;
 								    int oz = src1_idx / nb2;
 								    int oy = (src1_idx - (oz * nb2)) / nb1;
 								    int ox = src1_idx % nb1;
 								    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
 								        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
 								    } else {
 								        dst[i] = x[i];
 								    }
 								}
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
+								static __global__ void gelu_f32(const float * x, float * dst, const int k) {
-												Fix compile error on Windows CUDA (#2207)


											
										
										
											2023-07-13 15:58:09 +02:00
+								    const float GELU_COEF_A    = 0.044715f;
 								    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
+								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    float xi = x[i];
 								    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								static __global__ void silu_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] / (1.0f + expf(-x[i]));
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const float GELU_QUICK_COEF = -1.702f;
 								    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static __global__ void tanh_f32(const float * x, float * dst, int k) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = tanhf(x[i]);
 								}
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								static __global__ void relu_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = fmaxf(x[i], 0);
 								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
 								}
 								static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
 								}
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								static __global__ void sqr_f32(const float * x, float * dst, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] * x[i];
 								}
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								template <int block_size>
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 								    const int tid = threadIdx.x;
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    float2 mean_var = make_float2(0.f, 0.f);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    for (int col = tid; col < ncols; col += block_size) {
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								        const float xi = x[row*ncols + col];
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								        mean_var.x += xi;
 								        mean_var.y += xi * xi;
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								    }
 								    // sum up partial sums
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    mean_var = warp_reduce_sum(mean_var);
 								    if (block_size > WARP_SIZE) {
 								        __shared__ float2 s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = mean_var;
 								        }
 								        __syncthreads();
 								        mean_var = s_sum[lane_id];
 								        mean_var = warp_reduce_sum(mean_var);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								    }
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    const float mean = mean_var.x / ncols;
 								    const float var = mean_var.y / ncols - mean * mean;
 								    const float inv_std = rsqrtf(var + eps);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    for (int col = tid; col < ncols; col += block_size) {
 								        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								    }
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
 								    // operation
 								    int offset_dst =
 								        nidx +
 								        blockIdx.y * ne0 +
 								        blockIdx.z * ne0 * gridDim.y;
 								    if (blockIdx.z < ne02) { // src0
 								        int offset_src =
 								            nidx +
 								            blockIdx.y * ne0 +
 								            blockIdx.z * ne0 * gridDim.y;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        dst[offset_dst] = x[offset_src];
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    } else {
 								        int offset_src =
 								            nidx +
 								            blockIdx.y * ne0 +
 								            (blockIdx.z - ne02) * ne0 *  gridDim.y;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        dst[offset_dst] = y[offset_src];
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    }
 								}
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
 								    // blockIdx.z: idx of ne02*ne03
 								    // blockIdx.y: idx of ne01*scale_factor， aka ne1
 								    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
 								    // ne00xne01: ne00 * ne01
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int ne0 = ne00 * scale_factor;
 								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
 								    // operation
 								    int i00 = nidx / scale_factor;
 								    int i01 = blockIdx.y / scale_factor;
 								    int offset_src =
 								        i00 +
 								        i01 * ne00 +
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        blockIdx.z * ne00xne01;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int offset_dst =
 								        nidx +
 								        blockIdx.y * ne0 +
 								        blockIdx.z * ne0 * gridDim.y;
 								    dst[offset_dst] = x[offset_src];
 								}
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
 								    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
 								    // blockIdx.y: idx of ne1
 								    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
 								    // operation
 								    int offset_dst =
 								        nidx +
 								        blockIdx.y * ne0 +
 								        blockIdx.z * ne0 * gridDim.y;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								        int offset_src =
 								            nidx +
 								            blockIdx.y * ne00 +
 								            blockIdx.z * ne00 * ne01;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        dst[offset_dst] = x[offset_src];
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    } else {
 								        dst[offset_dst] = 0.0f;
 								    }
 								}
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
 								    // blockIDx.x: idx of ne0 / BLOCK_SIZE
 								    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 								    if (nidx >= ne0) {
 								        return;
 								    }
 								    dst[nidx] = start + step * nidx;
 								}
 								static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
 								    // blockIDx.y: idx of timesteps->ne[0]
 								    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
 								    int i = blockIdx.y;
 								    int j = threadIdx.x + blockIdx.x * blockDim.x;
 								    float * embed_data = (float *)((char *)dst +  i*nb1);
 								    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
 								        embed_data[dim] = 0.f;
 								    }
 								    int half = dim / 2;
 								    if (j >= half) {
 								        return;
 								    }
 								    float timestep = timesteps[i];
 								    float freq = (float)expf(-logf(max_period) * j / half);
 								    float arg = timestep * freq;
 								    embed_data[j] = cosf(arg);
 								    embed_data[j + half] = sinf(arg);
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								template <int block_size>
 								static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    // blockIdx.x: num_groups idx
 								    // threadIdx.x: block_size idx
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int start = blockIdx.x * group_size;
 								    int end = start + group_size;
 								    start += threadIdx.x;
 								    if (end >= ne_elements) {
 								        end = ne_elements;
 								    }
 								    float tmp = 0.0f; // partial sum for thread in warp
 								    for (int j = start; j < end; j += block_size) {
 								        tmp += x[j];
 								    }
 								    tmp = warp_reduce_sum(tmp);
 								    if (block_size > WARP_SIZE) {
 								        __shared__ float s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = tmp;
 								        }
 								        __syncthreads();
 								        tmp = s_sum[lane_id];
 								        tmp = warp_reduce_sum(tmp);
 								    }
 								    float mean = tmp / group_size;
 								    tmp = 0.0f;
 								    for (int j = start; j < end; j += block_size) {
 								        float xi = x[j] - mean;
 								        dst[j] = xi;
 								        tmp += xi * xi;
 								    }
 								    tmp = warp_reduce_sum(tmp);
 								    if (block_size > WARP_SIZE) {
 								        __shared__ float s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = tmp;
 								        }
 								        __syncthreads();
 								        tmp = s_sum[lane_id];
 								        tmp = warp_reduce_sum(tmp);
 								    }
 								    float variance = tmp / group_size;
 								    float scale = rsqrtf(variance + eps);
 								    for (int j = start; j < end; j += block_size) {
 								        dst[j] *= scale;
 								    }
 								}
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								template <int block_size>
-												make rms_norm_eps a parameter (#2374)

* make rms_norm_eps a parameter

* add rms_norm_eps to command line

* fix baby llama, test-grad0

* use scientific notation for eps param in the help

ggml-ci
											
										
										
											2023-07-24 17:57:12 +02:00
+								static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 								    const int tid = threadIdx.x;
 								    float tmp = 0.0f; // partial sum for thread in warp
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    for (int col = tid; col < ncols; col += block_size) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        const float xi = x[row*ncols + col];
 								        tmp += xi * xi;
 								    }
 								    // sum up partial sums
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    tmp = warp_reduce_sum(tmp);
 								    if (block_size > WARP_SIZE) {
 								        __shared__ float s_sum[32];
 								        int warp_id = threadIdx.x / WARP_SIZE;
 								        int lane_id = threadIdx.x % WARP_SIZE;
 								        if (lane_id == 0) {
 								            s_sum[warp_id] = tmp;
 								        }
 								        __syncthreads();
 								        tmp = s_sum[lane_id];
 								        tmp = warp_reduce_sum(tmp);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    }
 								    const float mean = tmp / ncols;
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								    const float scale = rsqrtf(mean + eps);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    for (int col = tid; col < ncols; col += block_size) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        dst[row*ncols + col] = scale * x[row*ncols + col];
 								    }
 								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const block_q4_0 * x = (const block_q4_0 *) vx;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    const dfloat d = x[ib].d;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    const int vui = x[ib].qs[iqs];
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v.x = vui & 0xF;
 								    v.y = vui >> 4;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v = __hsub2(v, {8.0f, 8.0f});
 								    v = __hmul2(v, {d, d});
 								#else
 								    v.x = (v.x - 8.0f) * d;
 								    v.y = (v.y - 8.0f) * d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const block_q4_1 * x = (const block_q4_1 *) vx;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    const dfloat d = __low2half(x[ib].dm);
 								    const dfloat m = __high2half(x[ib].dm);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    const int vui = x[ib].qs[iqs];
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v.x = vui & 0xF;
 								    v.y = vui >> 4;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v = __hmul2(v, {d, d});
 								    v = __hadd2(v, {m, m});
 								#else
 								    v.x = (v.x * d) + m;
 								    v.y = (v.y * d) + m;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const block_q5_0 * x = (const block_q5_0 *) vx;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    const dfloat d = x[ib].d;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
 								    uint32_t qh;
 								    memcpy(&qh, x[ib].qh, sizeof(qh));
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
 								    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
 								    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v = __hsub2(v, {16.0f, 16.0f});
 								    v = __hmul2(v, {d, d});
 								#else
 								    v.x = (v.x - 16.0f) * d;
 								    v.y = (v.y - 16.0f) * d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const block_q5_1 * x = (const block_q5_1 *) vx;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    const dfloat d = __low2half(x[ib].dm);
 								    const dfloat m = __high2half(x[ib].dm);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
 								    uint32_t qh;
 								    memcpy(&qh, x[ib].qh, sizeof(qh));
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
 								    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
 								    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v = __hmul2(v, {d, d});
 								    v = __hadd2(v, {m, m});
 								#else
 								    v.x = (v.x * d) + m;
 								    v.y = (v.y * d) + m;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const block_q8_0 * x = (const block_q8_0 *) vx;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    const dfloat d = x[ib].d;
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v.x = x[ib].qs[iqs + 0];
 								    v.y = x[ib].qs[iqs + 1];
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    v = __hmul2(v, {d, d});
 								#else
 								    v.x *= d;
 								    v.y *= d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 06:48:06 +01:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
 								    const int i = blockIdx.x;
 								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int il  = tid/8;
 								    const int ir  = tid%8;
 								    const int ib = 8*i + ir;
 								    if (ib >= nb32) {
 								        return;
 								    }
 								    dst_t * y = yy + 256*i + 32*ir + 4*il;
 								    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
 								    const float d = __half2float(x->d);
 								    const float dm = -8*d;
 								    const uint8_t * q = x->qs + 4*il;
 								    for (int l = 0; l < 4; ++l) {
 								        y[l+ 0] = d * (q[l] & 0xF) + dm;
 								        y[l+16] = d * (q[l] >>  4) + dm;
 								    }
 								}
 								template<typename dst_t>
 								static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
 								    const int i = blockIdx.x;
 								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int il  = tid/8;
 								    const int ir  = tid%8;
 								    const int ib = 8*i + ir;
 								    if (ib >= nb32) {
 								        return;
 								    }
 								    dst_t * y = yy + 256*i + 32*ir + 4*il;
 								    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
 								    const float2 d = __half22float2(x->dm);
 								    const uint8_t * q = x->qs + 4*il;
 								    for (int l = 0; l < 4; ++l) {
 								        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
 								        y[l+16] = d.x * (q[l] >>  4) + d.y;
 								    }
 								}
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								//================================== k-quants
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    const int i   = blockIdx.x;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    const block_q2_K * x = (const block_q2_K *) vx;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const int tid = threadIdx.x;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const int n   = tid/32;
 								    const int l   = tid - 32*n;
 								    const int is  = 8*n + l/16;
 								    const uint8_t q = x[i].qs[32*n + l];
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 128*n;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    float dall = __low2half(x[i].dm);
 								    float dmin = __high2half(x[i].dm);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
 								    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
 								    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
 								    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int is = tid/16;  // 0 or 1
 								    const int il = tid%16;  // 0...15
 								    const uint8_t q = x[i].qs[il] >> (2*is);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 16*is + il;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    float dall = __low2half(x[i].dm);
 								    float dmin = __high2half(x[i].dm);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
 								    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    const int i = blockIdx.x;
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    const block_q3_K * x = (const block_q3_K *) vx;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
 								    const int r = threadIdx.x/4;
 								    const int tid = r/2;
 								    const int is0 = r%2;
 								    const int l0 = 16*is0 + 4*(threadIdx.x%4);
 								    const int n = tid / 4;
 								    const int j = tid - 4*n;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    uint8_t m = 1 << (4*n + j);
 								    int is = 8*n + 2*j + is0;
 								    int shift = 2*j;
 								    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
 								                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
 								                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
 								                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
 								    float d_all = x[i].d;
 								    float dl = d_all * (us - 32);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 128*n + 32*j;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const uint8_t * q = x[i].qs + 32*n;
 								    const uint8_t * hm = x[i].hmask;
 								    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x;
 								    const int is  = tid/16;  // 0 or 1
 								    const int il  = tid%16;  // 0...15
 								    const int im  = il/8;    // 0...1
 								    const int in  = il%8;    // 0...7
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 16*is + il;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
 								    const uint8_t q = x[i].qs[il] >> (2*is);
 								    const uint8_t h = x[i].hmask[in] >> (2*is + im);
 								    const float   d = (float)x[i].d;
 								    if (is == 0) {
 								        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
 								        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
 								    } else {
 								        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
 								        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
 								    }
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
 								    if (j < 4) {
 								        d = q[j] & 63; m = q[j + 4] & 63;
 								    } else {
 								        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
 								        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
 								    }
 								}
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    const block_q4_K * x = (const block_q4_K *) vx;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    const int i = blockIdx.x;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int il  = tid/8;
 								    const int ir  = tid%8;
 								    const int is  = 2*il;
 								    const int n   = 4;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    const float dall = __low2half(x[i].dm);
 								    const float dmin = __high2half(x[i].dm);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    const uint8_t * q = x[i].qs + 32*il + n*ir;
 								    uint8_t sc, m;
 								    get_scale_min_k4(is + 0, x[i].scales, sc, m);
 								    const float d1 = dall * sc; const float m1 = dmin * m;
 								    get_scale_min_k4(is + 1, x[i].scales, sc, m);
 								    const float d2 = dall * sc; const float m2 = dmin * m;
 								    for (int l = 0; l < n; ++l) {
 								        y[l + 0] = d1 * (q[l] & 0xF) - m1;
 								        y[l +32] = d2 * (q[l] >>  4) - m2;
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x;
 								    const uint8_t * q = x[i].qs;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K;
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								    const float d = (float)x[i].dm[0];
 								    const float m = (float)x[i].dm[1];
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
 								    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    const block_q5_K * x = (const block_q5_K *) vx;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    const int i = blockIdx.x;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    // assume 64 threads - this is very slightly better than the one below
 								    const int tid = threadIdx.x;
 								    const int il  = tid/16;   // il is in 0...3
 								    const int ir  = tid%16;   // ir is in 0...15
 								    const int is  = 2*il;     // is is in 0...6
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    const float dall = __low2half(x[i].dm);
 								    const float dmin = __high2half(x[i].dm);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
 								    const uint8_t * qh = x[i].qh + 2*ir;
 								    uint8_t sc, m;
 								    get_scale_min_k4(is + 0, x[i].scales, sc, m);
 								    const float d1 = dall * sc; const float m1 = dmin * m;
 								    get_scale_min_k4(is + 1, x[i].scales, sc, m);
 								    const float d2 = dall * sc; const float m2 = dmin * m;
 								    uint8_t   hm  = 1 << (2*il);
 								    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
 								    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
 								    hm <<= 1;
 								    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
 								    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x;
 								    const uint8_t q = x[i].qs[tid];
 								    const int im = tid/8;  // 0...3
 								    const int in = tid%8;  // 0...7
 								    const int is = tid/16; // 0 or 1
 								    const uint8_t h = x[i].qh[in] >> im;
 								    const float d = x[i].d;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + tid;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
 								    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    const block_q6_K * x = (const block_q6_K *) vx;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    const int i = blockIdx.x;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    // assume 64 threads - this is very slightly better than the one below
 								    const int tid = threadIdx.x;
 								    const int ip  = tid/32;   // ip is 0 or 1
 								    const int il  = tid - 32*ip; // 0...32
 								    const int is  = 8*ip + il/16;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 128*ip + il;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
 								    const float d = x[i].d;
 								    const uint8_t * ql = x[i].ql + 64*ip + il;
 								    const uint8_t   qh = x[i].qh[32*ip + il];
 								    const int8_t  * sc = x[i].scales + is;
 								    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
 								    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
 								    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
 								    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    // assume 32 threads
 								    const int tid = threadIdx.x;
 								    const int ip  = tid/16;         // 0 or 1
 								    const int il  = tid - 16*ip;    // 0...15
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    dst_t * y = yy + i*QK_K + 16*ip + il;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
 								    const float d = x[i].d;
 								    const uint8_t   ql = x[i].ql[16*ip + il];
 								    const uint8_t   qh = x[i].qh[il] >> (2*ip);
 								    const int8_t  * sc = x[i].scales;
 								    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
 								    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
 								    switch (type) {
 								        case GGML_TYPE_Q4_0:
 								        case GGML_TYPE_Q4_1:
 								        case GGML_TYPE_Q5_0:
 								        case GGML_TYPE_Q5_1:
 								        case GGML_TYPE_Q8_0:
 								        case GGML_TYPE_Q2_K:
 								        case GGML_TYPE_Q3_K:
 								        case GGML_TYPE_Q4_K:
 								        case GGML_TYPE_Q5_K:
 								        case GGML_TYPE_Q6_K:
 								            return true;
 								        default:
 								            return false;
 								    }
 								}
 								template<typename dst_t>
 								static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
 								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint16_t * q2 = x[i].qs + 4*ib;
 								    const uint8_t  * aux8 = (const uint8_t *)q2;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								    const uint32_t aux32 = q2[2] | (q2[3] << 16);
 								    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
 								    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
 								    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
 								#else
 								    assert(false);
 								#endif
 								}
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq2_xs * x = (const block_iq2_xs *) vx;
 								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint16_t * q2 = x[i].qs + 4*ib;
 								    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
 								    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
 								    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
 								    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
 								#else
 								    assert(false);
 								#endif
 								}
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq2_s * x = (const block_iq2_s *) vx;
 								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
 								    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
 								    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 								    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
 								#else
 								    assert(false);
 								#endif
 								}
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
 								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint8_t  * q3 = x[i].qs + 8*ib;
 								    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
 								    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
 								    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
 								    const uint32_t aux32 = gas[0] | (gas[1] << 16);
 								    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
 								    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
 								        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
 								    }
 								#else
 								    assert(false);
 								#endif
 								}
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq3_s * x = (const block_iq3_s *) vx;
 								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
 								    const uint8_t * qs = x[i].qs + 8*ib;
-												ggml : IQ3_S improvements (#5829)

* iq3_s: somewhat faster AVX2 dot product

On Ryzen a 7950X TG-128 increases to 16 t/s from 15.5 t/s using
16 threads. For 8 threads it is 13.85 t/s vs 11.75 t/s.
PP-512 increases to 28.5 t/s from 23.8 t/s.

* iq3_s: somewhat faster ARM_NEON dot product

Still dog slow - 10.7 t/s up from 9.9 t/s.

* iq3_s: another small ARM_NEON improvement

10.7 -> 11.0 t/s. Using vmulq_s8 is faster than the xor - sub trick
that works best on AVX2.

* iq3_s: minor improvement on Metal

49.4 t/s -> 50.3 t/s

* iq3_s: PPL improvement

E.g., for a context of 4096 LLaMA-v2-7B goes to 5.1340 from 5.1653.

* iq3_s: use new grid everywhere

* Fix ARM_NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-02 16:00:51 +01:00
+								    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
 								    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
 								    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								    const uint8_t signs = x[i].signs[4*ib + il];
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
 								        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
 								    }
 								#else
 								    assert(false);
 								#endif
 								}
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq1_s * x = (const block_iq1_s  *) vx;
 								    const int tid = threadIdx.x;
 								#if QK_K == 256
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-.5 bit: we can do even better (#5999)

* iq1_s: we can do even better

Spent one of the 4 scale bits on a signs of a 0.125 shift.
I.e., quants are now -1 + delta, delta, 1 + delta, where delta
is +/- 0.125.

CUDA works, same performance as before.
PPL(LLaMA-v2-7B) is now 11.85!

* iq1_s: make scalar and AVX2 work with the new version

* iq1_s: make Neon work with new version.

~10% drop in performance, so will need some more work.

* iq1_s: make Metal work with new version

* iq1_s: very slightly faster dequantize on Metal

* iq1_s: fix dequantize on the CPU

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 16:53:15 +01:00
+								    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
 								    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
 								    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
 								    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
 								    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
 								    grid32[0] &= 0x0f0f0f0f;
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 07:51:49 +01:00
+								    for (int j = 0; j < 8; ++j) {
-.5 bit: we can do even better (#5999)

* iq1_s: we can do even better

Spent one of the 4 scale bits on a signs of a 0.125 shift.
I.e., quants are now -1 + delta, delta, 1 + delta, where delta
is +/- 0.125.

CUDA works, same performance as before.
PPL(LLaMA-v2-7B) is now 11.85!

* iq1_s: make scalar and AVX2 work with the new version

* iq1_s: make Neon work with new version.

~10% drop in performance, so will need some more work.

* iq1_s: make Metal work with new version

* iq1_s: very slightly faster dequantize on Metal

* iq1_s: fix dequantize on the CPU

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 16:53:15 +01:00
+								        y[j] = d * (q[j] + delta);
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 07:51:49 +01:00
+								    }
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								#else
 								    assert(false);
 								#endif
 								}
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 								template<typename dst_t>
 								static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
 								    const int tid = threadIdx.x;
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
 								    const uint8_t  * q4 = x[ib].qs + 4*il;
 								    const float d = (float)x[ib].d;
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
 								        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
 								    }
 								}
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
-												ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760)

* WIP: make i-quants work for QK_K = 64

* iq2_xs: attempt to fix AVX dot product for QK_K = 64

Tests pass, but I get gibberish.

* QK_K = 64 tests pass on ARM_NEON and Metal

Sadly, that does not mean it actually works.

* Make CUDA compile with QK_K = 64

Tests don't pass, plus we get misaligned access

* Q2_K: fixed bug in imatrix quantization for QK_K = 64

* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-28 09:37:02 +01:00
+								#if QK_K != 64
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								template<typename dst_t>
 								static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 								    const int i   = blockIdx.x;
 								    const block_iq4_xs * x = (const block_iq4_xs *)vx;
 								    const int tid = threadIdx.x;
 								    const int il = tid/8; // 0...3
 								    const int ib = tid%8; // 0...7
 								    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
 								    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
 								    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
 								    for (int j = 0; j < 4; ++j) {
 								        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
 								        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
 								    }
 								}
-												ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760)

* WIP: make i-quants work for QK_K = 64

* iq2_xs: attempt to fix AVX dot product for QK_K = 64

Tests pass, but I get gibberish.

* QK_K = 64 tests pass on ARM_NEON and Metal

Sadly, that does not mean it actually works.

* Make CUDA compile with QK_K = 64

Tests don't pass, plus we get misaligned access

* Q2_K: fixed bug in imatrix quantization for QK_K = 64

* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-28 09:37:02 +01:00
+								#endif
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    if (row > nrows) return;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    float tmp = 0; // partial sum for thread in warp
 								#if QK_K == 256
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    const int step = 16/K_QUANTS_PER_ITERATION;
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
 								    const int in = tid - step*im;                        // 0...15 or 0...7
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int q_offset = 32*im + l0;
 								    const int s_offset = 8*im;
 								    const int y_offset = 128*im + l0;
 								    uint32_t aux[4];
 								    const uint8_t * d = (const uint8_t *)aux;
 								    const uint8_t * m = (const uint8_t *)(aux + 2);
 								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 								        const float   * y = yy + i * QK_K + y_offset;
 								        const uint8_t * q = x[i].qs + q_offset;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								        const float dall = __low2half(x[i].dm);
 								        const float dmin = __high2half(x[i].dm);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
 								        aux[0] = a[0] & 0x0f0f0f0f;
 								        aux[1] = a[1] & 0x0f0f0f0f;
 								        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
 								        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
 								        float sum1 = 0, sum2 = 0;
 								        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
 								            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
 								                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
 								                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
 								                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
 								                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
 								                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
 								                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
 								                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
 								            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
 								                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								        }
 								        tmp += dall * sum1 - dmin * sum2;
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
 								    const int offset = tid * K_QUANTS_PER_ITERATION;
 								    uint32_t uaux[2];
 								    const uint8_t * d = (const uint8_t *)uaux;
 								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
 								        const float   * y = yy + i * QK_K + offset;
 								        const uint8_t * q = x[i].qs + offset;
 								        const uint32_t * s = (const uint32_t *)x[i].scales;
 								        uaux[0] = s[0] & 0x0f0f0f0f;
 								        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								        const float2 dall = __half22float2(x[i].dm);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
 								        float sum1 = 0, sum2 = 0;
 								        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
 								            const uint8_t ql = q[l];
 								            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
 								                  + y[l+16] * d[1] * ((ql >> 2) & 3)
 								                  + y[l+32] * d[2] * ((ql >> 4) & 3)
 								                  + y[l+48] * d[3] * ((ql >> 6) & 3);
 								            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
 								        }
 								        tmp += dall.x * sum1 - dall.y * sum2;
 								    }
 								#endif
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    if (threadIdx.x == 0) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								        dst[row] = tmp;
 								    }
 								}
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    if (row > nrows) return;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
 								    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    float tmp = 0; // partial sum for thread in warp
 								#if QK_K == 256
 								    const uint16_t kmask1 = 0x0303;
 								    const uint16_t kmask2 = 0x0f0f;
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
 								    const int step = 16/K_QUANTS_PER_ITERATION;
 								    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
 								    const int in = tid - step*im;                        // 0....15 or 0...7
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    const uint8_t m = 1 << (4*im);
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int q_offset =  32*im + l0;
 								    const int y_offset = 128*im + l0;
 								    uint16_t utmp[4];
 								    const int8_t * s = (const int8_t *)utmp;
 								    const uint16_t s_shift = 4*im;
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								        const float   * y  = yy + i * QK_K + y_offset;
 								        const uint8_t * q = x[i].qs + q_offset;
 								        const uint8_t * h = x[i].hmask + l0;
 								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
 								        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
 								        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
 								        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
 								        const float d = x[i].d;
 								        float sum = 0;
 								        for (int l = 0; l < n; ++l) {
 								            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
 								                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
 								                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
 								                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
 								            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
 								                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
 								                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
 								                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
 								        }
 								        tmp += d * sum;
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
 								    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
 								    const int in = offset/8;                                 // 0 or 1
 								    const int im = offset%8;                                 // 0...7
 								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
 								        const float   * y = yy + i * QK_K + offset;
 								        const uint8_t * q = x[i].qs + offset;
 								        const uint8_t * s = x[i].scales;
 								        const float dall = (float)x[i].d;
 								        float sum = 0;
 								        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
 								            const uint8_t hl = x[i].hmask[im+l] >> in;
 								            const uint8_t ql = q[l];
 								            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
 								                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
 								                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
 								                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
 								        }
 								        tmp += sum;
 								    }
 								#endif
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    if (threadIdx.x == 0) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								        dst[row] = tmp;
 								    }
 								}
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    if (row > nrows) return;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    const block_q4_K * x = (const block_q4_K *)vx + ib0;
 								#if QK_K == 256
 								    const uint16_t kmask1 = 0x3f3f;
 								    const uint16_t kmask2 = 0x0f0f;
 								    const uint16_t kmask3 = 0xc0c0;
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
 								    const int il  = tid/step;                            // 0...3
 								    const int ir  = tid - step*il;                       // 0...7 or 0...3
 								    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
 								    const int in = il%2;
 								    const int l0 = n*(2*ir + in);
 								    const int q_offset = 32*im + l0;
 								    const int y_offset = 64*im + l0;
 								    uint16_t aux[4];
 								    const uint8_t * sc = (const uint8_t *)aux;
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 07:49:20 +02:00
+								#if K_QUANTS_PER_ITERATION == 2
 								    uint32_t q32[4];
 								    const uint8_t * q4 = (const uint8_t *)q32;
 								#else
 								    uint16_t q16[4];
 								    const uint8_t * q4 = (const uint8_t *)q16;
 								#endif
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    float tmp = 0; // partial sum for thread in warp
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								        const float   * y1 = yy + i*QK_K + y_offset;
 								        const float   * y2 = y1 + 128;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								        const float dall = __low2half(x[i].dm);
 								        const float dmin = __high2half(x[i].dm);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        aux[0] = a[im+0] & kmask1;
 								        aux[1] = a[im+2] & kmask1;
 								        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
 								        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 07:49:20 +02:00
+								#if K_QUANTS_PER_ITERATION == 2
 								        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
 								        const uint32_t * q2 = q1 + 16;
 								        q32[0] = q1[0] & 0x0f0f0f0f;
 								        q32[1] = q1[0] & 0xf0f0f0f0;
 								        q32[2] = q2[0] & 0x0f0f0f0f;
 								        q32[3] = q2[0] & 0xf0f0f0f0;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								        float4 s = {0.f, 0.f, 0.f, 0.f};
 								        float smin = 0;
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 07:49:20 +02:00
+								        for (int l = 0; l < 4; ++l) {
 								            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
 								            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
 								        }
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 07:49:20 +02:00
+								        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
 								#else
 								        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
 								        const uint16_t * q2 = q1 + 32;
 								        q16[0] = q1[0] & 0x0f0f;
 								        q16[1] = q1[0] & 0xf0f0;
 								        q16[2] = q2[0] & 0x0f0f;
 								        q16[3] = q2[0] & 0xf0f0;
 								        float4 s = {0.f, 0.f, 0.f, 0.f};
 								        float smin = 0;
 								        for (int l = 0; l < 2; ++l) {
 								            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
 								            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
 								            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
 								        }
 								        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
 								#endif
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
 								    const int step = tid * K_QUANTS_PER_ITERATION;
 								    uint16_t aux16[2];
 								    const uint8_t * s = (const uint8_t *)aux16;
 								    float tmp = 0;
 								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
 								        const uint8_t * q = x[i].qs + step;
 								        const float   * y = yy + i*QK_K + step;
 								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        aux16[0] = a[0] & 0x0f0f;
 								        aux16[1] = (a[0] >> 4) & 0x0f0f;
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								        const float d = (float)x[i].dm[0];
 								        const float m = (float)x[i].dm[1];
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								        float sum = 0.f;
 								        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
 								            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
 								                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
 								                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
 								                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
 								        }
 								        tmp += sum;
 								    }
 								#endif
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    if (tid == 0) {
 								        dst[row] = tmp;
 								    }
 								}
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    const int row = blockIdx.x;
 								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    const block_q5_K * x = (const block_q5_K *)vx + ib0;
 								    float tmp = 0; // partial sum for thread in warp
 								#if QK_K == 256
 								    const uint16_t kmask1 = 0x3f3f;
 								    const uint16_t kmask2 = 0x0f0f;
 								    const uint16_t kmask3 = 0xc0c0;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int tid = threadIdx.x/2;  // 0...15
 								    const int ix  = threadIdx.x%2;
 								    const int il  = tid/4;     // 0...3
 								    const int ir  = tid - 4*il;// 0...3
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int n   = 2;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
 								    const int in = il%2;
 								    const int l0 = n*(2*ir + in);
 								    const int q_offset = 32*im + l0;
 								    const int y_offset = 64*im + l0;
 								    const uint8_t hm1  = 1 << (2*im);
 								    const uint8_t hm2  = hm1 << 4;
 								    uint16_t aux[4];
 								    const uint8_t * sc = (const uint8_t *)aux;
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 23:19:47 +02:00
+								    uint16_t q16[8];
 								    const uint8_t * q4 = (const uint8_t *)q16;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    for (int i = ix; i < num_blocks_per_row; i += 2) {
 								        const uint8_t * ql1 = x[i].qs + q_offset;
 								        const uint8_t * qh  = x[i].qh + l0;
 								        const float   * y1  = yy + i*QK_K + y_offset;
 								        const float   * y2  = y1 + 128;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								        const float dall = __low2half(x[i].dm);
 								        const float dmin = __high2half(x[i].dm);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								        const uint16_t * a = (const uint16_t *)x[i].scales;
 								        aux[0] = a[im+0] & kmask1;
 								        aux[1] = a[im+2] & kmask1;
 								        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
 								        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 								        float4 sum = {0.f, 0.f, 0.f, 0.f};
 								        float smin = 0;
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 23:19:47 +02:00
+								        const uint16_t * q1 = (const uint16_t *)ql1;
 								        const uint16_t * q2 = q1 + 32;
 								        q16[0] = q1[0] & 0x0f0f;
 								        q16[1] = q1[8] & 0x0f0f;
 								        q16[2] = (q1[0] >> 4) & 0x0f0f;
 								        q16[3] = (q1[8] >> 4) & 0x0f0f;
 								        q16[4] = q2[0] & 0x0f0f;
 								        q16[5] = q2[8] & 0x0f0f;
 								        q16[6] = (q2[0] >> 4) & 0x0f0f;
 								        q16[7] = (q2[8] >> 4) & 0x0f0f;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								        for (int l = 0; l < n; ++l) {
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 23:19:47 +02:00
+								            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
 								                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
 								            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
 								                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
 								            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
 								                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
 								            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
 								                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
 								                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								        }
 								        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    }
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
 								    const int step = tid * K_QUANTS_PER_ITERATION;
 								    const int im = step/8;
 								    const int in = step%8;
 								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
 								        const uint8_t * q = x[i].qs + step;
 								        const int8_t  * s = x[i].scales;
 								        const float   * y = yy + i*QK_K + step;
 								        const float     d = x[i].d;
 								        float sum = 0.f;
 								        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
 								            const uint8_t h = x[i].qh[in+j] >> im;
 								            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
 								                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
 								                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
 								                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
 								        }
 								        tmp += sum;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#endif
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								    if (threadIdx.x == 0) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								        dst[row] = tmp;
 								    }
 								}
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    if (row > nrows) return;
 								    const int num_blocks_per_row = ncols / QK_K;
 								    const int ib0 = row*num_blocks_per_row;
 								    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
 								    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
 								    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
 								    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
 								    const int in = tid - step*im;                        // 0...15 or 0...7
 								#if K_QUANTS_PER_ITERATION == 1
 								    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
 								    const int is = 0;
 								#else
 								    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
 								    const int is = in / 4;
 								#endif
 								    const int ql_offset = 64*im + l0;
 								    const int qh_offset = 32*im + l0;
 								    const int s_offset  =  8*im + is;
 								    const int y_offset = 128*im + l0;
 								    float tmp = 0; // partial sum for thread in warp
 								    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 								        const float   * y  = yy + i * QK_K + y_offset;
 								        const uint8_t * ql = x[i].ql + ql_offset;
 								        const uint8_t * qh = x[i].qh + qh_offset;
 								        const int8_t  * s  = x[i].scales + s_offset;
 								        const float d = x[i].d;
 								#if K_QUANTS_PER_ITERATION == 1
 								        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
 								                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
 								                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
 								                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
 								                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
 								                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
 								                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
 								                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
 								        tmp += sum;
 								#else
 								        float sum = 0;
 								        for (int l = 0; l < 4; ++l) {
 								            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
 								                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
 								                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
 								                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
 								        }
 								        tmp += sum;
 								#endif
 								    }
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
 								    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
 								    const int step = tid * K_QUANTS_PER_ITERATION;
 								    float tmp = 0; // partial sum for thread in warp
 								    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
 								        const float   * y  = yy + i * QK_K + step;
 								        const uint8_t * ql = x[i].ql + step;
 								        const uint8_t * qh = x[i].qh + step;
 								        const int8_t  * s  = x[i].scales;
 								        const float d = x[i+0].d;
 								        float sum = 0;
 								        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
 								            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
 								                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
 								                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
 								                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
 								        }
 								        tmp += sum;
 								    }
 								#endif
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
 								    if (tid == 0) {
 								        dst[row] = tmp;
 								    }
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const half * x = (const half *) vx;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    // automatic half -> float type cast if dfloat == float
 								    v.x = x[ib + iqs + 0];
 								    v.y = x[ib + iqs + 1];
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
 								    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    if (ix >= kx_padded) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								        return;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i_padded = iy*kx_padded + ix;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    block_q8_1 * y = (block_q8_1 *) vy;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int ib = i_padded / QK8_1; // block index
 								    const int iqs = i_padded % QK8_1; // quant index
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    float amax = fabsf(xi);
 								    float sum = xi;
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    amax = warp_reduce_max(amax);
 								    sum = warp_reduce_sum(sum);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
 								    const float d = amax / 127;
 								    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
 								    y[ib].qs[iqs] = q;
 								    if (iqs > 0) {
 								        return;
 								    }
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    reinterpret_cast<half&>(y[ib].ds.x) = d;
 								    reinterpret_cast<half&>(y[ib].ds.y) = sum;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								static __global__ void k_get_rows(
 								            const void * src0, const int32_t * src1, dst_t * dst,
 								            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
 								            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
 								            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
 								            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
 								            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
 								    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
 								    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
 								    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
 								    if (i00 >= ne00) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        return;
 								    }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
 								    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    const int ib = i00/qk; // block index
 								    const int iqs = (i00%qk)/qr; // quant index
 								    const int iybs = i00 - i00%qk; // dst block start index
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    const int y_offset = qr == 1 ? 1 : qk/2;
 								    // dequantize
 								    dfloat2 v;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    dequantize_kernel(src0_row, ib, iqs, v);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    dst_row[iybs + iqs + 0]        = v.x;
 								    dst_row[iybs + iqs + y_offset] = v.y;
 								}
 								template<typename src0_t, typename dst_t>
 								static __global__ void k_get_rows_float(
 								            const src0_t * src0, const int32_t * src1, dst_t * dst,
 								            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
 								            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
 								            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
 								            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
 								            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
 								    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
 								    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
 								    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
 								    if (i00 >= ne00) {
 								        return;
 								    }
 								    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
 								    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
 								    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
 								    dst_row[i00] = src0_row[i00];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 								static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 17:24:08 +01:00
+								    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
-												ggml : remove bit shuffling (#1405)

* ggml : remove Q4_0 bit shufling (ARM NEON)

* ggml : remove Q4_1 bit shuffling (ARM NEON + reference)

* ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON)

* ggml : remove Q4_2 bit shuffling (WIP, BROKEN)

* ggml : remove Q5_0 bit shuffling (ARM NEON)

* ggml : 2x faster scalar implementations

* ggml : remove Q5_1 bit shuffling (ARM NEON + scalar)

* ggml : simplify scalar dot

* ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit

* ggml : fix Q4_1 quantization

* ggml : update cuBLAS + normalize variable names

* ggml : remove Q4_2 mode

* ggml : minor formatting

* ggml : fix Q5_0 quantization

* scripts : add script for measuring the time per token

* AVX implementations (#1370)

* ggml : uniform 5th bit extraction

* llama : produce error upon loading old model files

* llama : fix model magic/version write

* ggml : speed-up Q5_0 + Q5_1 at 4 threads

* ggml : preserve old Q4 and Q5 formats

* ggml : simplify Q8_1 - no need for low / high sums anymore

* ggml : fix Q8_0 and Q8_1 rounding

* Revert "AVX implementations (#1370)"

This reverts commit 948d124837f9d287d8490f41338e0e4cceb0814f.

* ggml : fix AVX2 implementation

* sha : update hashes for 7B and 13B

* readme : update timings + remove warning banner

* llama : update v2 PR number to 1405

* ggml : fix WASM comments

* ggml : back to original bit order

* readme : add note that Q4 and Q5 have been changed

* llama : fix return for unknown version

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
											
										
										
											2023-05-11 23:23:08 +02:00
-												cuda : deduplicated dequantization code (#1453)


											
										
										
											2023-05-14 20:53:23 +02:00
+								    if (i >= k) {
 								        return;
-												ggml : add Q5_0 and Q5_1 quantization (#1187)

* ggml : add Q5_0 quantization (cuBLAS only)

* ggml : fix Q5_0 qh -> uint32_t

* ggml : fix q5_0 histogram stats

* ggml : q5_0 scalar dot product

* ggml : q5_0 ARM NEON dot

* ggml : q5_0 more efficient ARM NEON using uint64_t masks

* ggml : rename Q5_0 -> Q5_1

* ggml : adding Q5_0 mode

* quantize : add Q5_0 and Q5_1 to map

* ggml : AVX2 optimizations for Q5_0, Q5_1 (#1195)

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
											
										
										
											2023-04-26 22:14:13 +02:00
+								    }
-												cuda : deduplicated dequantization code (#1453)


											
										
										
											2023-05-14 20:53:23 +02:00
+								    const int ib = i/qk; // block index
 								    const int iqs = (i%qk)/qr; // quant index
 								    const int iybs = i - i%qk; // y block start index
 								    const int y_offset = qr == 1 ? 1 : qk/2;
-												ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
											
										
										
											2023-04-25 22:40:51 +02:00
-												cuda : deduplicated dequantization code (#1453)


											
										
										
											2023-05-14 20:53:23 +02:00
+								    // dequantize
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    dfloat2 v;
 								    dequantize_kernel(vx, ib, iqs, v);
 								    y[iybs + iqs + 0]        = v.x;
 								    y[iybs + iqs + y_offset] = v.y;
-												ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
											
										
										
											2023-04-25 22:40:51 +02:00
+								}
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 17:24:08 +01:00
+								template <typename src_t, typename dst_t>
 								static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    const src_t * x = (src_t *) vx;
 								    y[i] = x[i];
 								}
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 20:38:54 +01:00
+								template <bool need_check>
 								static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
 								#if __CUDA_ARCH__ >= CC_PASCAL
 								    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
 								    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
 								    const int * x0 = ((int *) vx) + blockIdx.x * nint;
 								    half2 * y2 = (half2 *) (y + i0);
 								    __shared__ int vals[nint];
 								#pragma unroll
 								    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
 								        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
 								            break;
 								        }
 								        const int ix = ix0 + threadIdx.x;
 								        vals[ix] = x0[ix];
 								    }
 								#pragma unroll
 								    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
 								        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
 								            return;
 								        }
 								        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
 								        const half    d = *b0;
 								        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
 								        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
 								    }
 								#else
 								    (void) vx; (void) y; (void) k;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 20:38:54 +01:00
+								#endif // __CUDA_ARCH__ >= CC_PASCAL
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#define VDR_Q4_0_Q8_1_MMVQ 2
 								#define VDR_Q4_0_Q8_1_MMQ  4
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
 								    const int * v, const int * u, const float & d4, const half2 & ds8) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int sumi = 0;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int i = 0; i < vdr; ++i) {
 								        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
 								        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 								        // SIMD dot product of quantized values
 								        sumi = __dp4a(vi0, u[2*i+0], sumi);
 								        sumi = __dp4a(vi1, u[2*i+1], sumi);
 								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 ds8f = __half22float2(ds8);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    // second part effectively subtracts 8 from each quant value
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q4_1_Q8_1_MMVQ 2
 								#define VDR_Q4_1_Q8_1_MMQ  4
 								template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
 								    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
 								#pragma unroll
 								    for (int i = 0; i < vdr; ++i) {
 								        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
 								        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 								        // SIMD dot product of quantized values
 								        sumi = __dp4a(vi0, u[2*i+0], sumi);
 								        sumi = __dp4a(vi1, u[2*i+1], sumi);
 								    }
 								#ifdef GGML_CUDA_F16
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
 								    const float d4d8 = tmp.x;
 								    const float m4s8 = tmp.y;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#else
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 dm4f = __half22float2(dm4);
 								    const float2 ds8f = __half22float2(ds8);
 								    const float d4d8 = dm4f.x * ds8f.x;
 								    const float m4s8 = dm4f.y * ds8f.y;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // GGML_CUDA_F16
 								    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
 								    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q5_0_Q8_1_MMVQ 2
 								#define VDR_Q5_0_Q8_1_MMQ  4
 								template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
 								    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    for (int i = 0; i < vdr; ++i) {
 								        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
 								        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
 								        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
 								        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
 								        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
 								        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
 								        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
 								        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
 								        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
 								        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
 								        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
 								        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
 								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 ds8f = __half22float2(ds8);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    // second part effectively subtracts 16 from each quant value
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q5_1_Q8_1_MMVQ 2
 								#define VDR_Q5_1_Q8_1_MMQ  4
 								template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
 								    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    for (int i = 0; i < vdr; ++i) {
 								        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
 								        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
 								        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
 								        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
 								        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
 								        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
 								        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
 								        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
 								        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
 								        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
 								        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
 								        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
 								    }
 								#ifdef GGML_CUDA_F16
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
 								    const float d5d8 = tmp.x;
 								    const float m5s8 = tmp.y;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#else
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 dm5f = __half22float2(dm5);
 								    const float2 ds8f = __half22float2(ds8);
 								    const float d5d8 = dm5f.x * ds8f.x;
 								    const float m5s8 = dm5f.y * ds8f.y;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // GGML_CUDA_F16
 								    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
 								    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q8_0_Q8_1_MMVQ 2
 								#define VDR_Q8_0_Q8_1_MMQ 8
 								template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int * v, const int * u, const float & d8_0, const float & d8_1) {
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    for (int i = 0; i < vdr; ++i) {
 								        // SIMD dot product of quantized values
 								        sumi = __dp4a(v[i], u[i], sumi);
 								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return d8_0*d8_1 * sumi;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
 								    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    for (int i = 0; i < vdr; ++i) {
 								        // SIMD dot product of quantized values
 								        sumi = __dp4a(v[i], u[i], sumi);
 								    }
 								#ifdef GGML_CUDA_F16
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
 								    const float d8d8 = tmp.x;
 								    const float m8s8 = tmp.y;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#else
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float2 dm8f = __half22float2(dm8);
 								    const float2 ds8f = __half22float2(ds8);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const float d8d8 = dm8f.x * ds8f.x;
 								    const float m8s8 = dm8f.y * ds8f.y;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#endif // GGML_CUDA_F16
 								    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
 								    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								}
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#define VDR_Q2_K_Q8_1_MMVQ 1
 								#define VDR_Q2_K_Q8_1_MMQ  2
 								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 								    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
 								    const half2 & dm2, const float * __restrict__ d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
 								#pragma unroll
 								    for (int i = 0; i < QR2_K; ++i) {
 								        const int sc = scales[2*i];
 								        const int vi = (v >> (2*i)) & 0x03030303;
 								        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
 								        // fill int with 4x m
 								        int m = sc >> 4;
 								        m |= m <<  8;
 								        m |= m << 16;
 								        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
 								    }
 								    const float2 dm2f = __half22float2(dm2);
 								    return dm2f.x*sumf_d - dm2f.y*sumf_m;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
 								    const half2 & dm2, const float & d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi_d = 0;
 								    int sumi_m = 0;
 								#pragma unroll
 								    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
 								        int sumi_d_sc = 0;
 								        const int sc = scales[i0 / (QI8_1/2)];
 								        // fill int with 4x m
 								        int m = sc >> 4;
 								        m |= m <<  8;
 								        m |= m << 16;
 								#pragma unroll
 								        for (int i = i0; i < i0 + QI8_1/2; ++i) {
 								            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
 								            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
 								        }
 								        sumi_d += sumi_d_sc * (sc & 0xF);
 								    }
 								    const float2 dm2f = __half22float2(dm2);
 								    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q3_K_Q8_1_MMVQ 1
 								#define VDR_Q3_K_Q8_1_MMQ  2
 								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 								    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
 								    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf = 0.0f;
 								#pragma unroll
 								    for (int i = 0; i < QR3_K; ++i) {
 								        const int isc = scale_offset + 2*i;
 								        const int isc_low = isc % (QK_K/32);
 								        const int sc_shift_low = 4 * (isc / (QK_K/32));
 								        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
 								        const int isc_high = isc % (QK_K/64);
 								        const int sc_shift_high = 2 * (isc / (QK_K/64));
 								        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
 								        const int sc = (sc_low | sc_high) - 32;
 								        const int vil = (vl >> (2*i)) & 0x03030303;
 								        const int vih = ((vh >> i) << 2) & 0x04040404;
 								        const int vi = __vsubss4(vil, vih);
 								        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
 								    }
 								    return d3 * sumf;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
 								    const float & d3, const float & d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    int sumi = 0;
 								#pragma unroll
 								    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
 								        int sumi_sc = 0;
 								        for (int i = i0; i < i0 + QI8_1/2; ++i) {
 								            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
 								        }
 								        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
 								    }
 								    return d3*d8 * sumi;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q4_K_Q8_1_MMVQ 2
 								#define VDR_Q4_K_Q8_1_MMQ  8
 								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
 								#pragma unroll
 								    for (int i = 0; i < QR4_K; ++i) {
 								        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
 								        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
 								        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
 								        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
 								        sumf_d += d8[i] * (dot1 * sc[i]);
 								        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
 								    }
 								    const float2 dm4f = __half22float2(dm4);
 								    return dm4f.x*sumf_d - dm4f.y*sumf_m;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
 								#pragma unroll
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        int sumi_d = 0;
 								#pragma unroll
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								        for (int j = 0; j < QI8_1; ++j) {
 								            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        }
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								        const float2 ds8f = __half22float2(ds8[i]);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								        sumf_d += ds8f.x * (sc[i] * sumi_d);
 								        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    }
 								    const float2 dm4f = __half22float2(dm4);
 								    return dm4f.x*sumf_d - dm4f.y*sumf_m;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								#define VDR_Q5_K_Q8_1_MMVQ 2
 								#define VDR_Q5_K_Q8_1_MMQ  8
 								// contiguous v/x values
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
 								#pragma unroll
 								    for (int i = 0; i < QR5_K; ++i) {
 								        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
 								        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
 								        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
 								        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
 								        const int v0i = vl0i | vh0i;
 								        const int v1i = vl1i | vh1i;
 								        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
 								        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
 								        sumf_d += d8[i] * (dot1 * sc[i]);
 								        sumf_m += d8[i] * (dot2 * m[i]);
 								    }
 								    const float2 dm5f = __half22float2(dm5);
 								    return dm5f.x*sumf_d - dm5f.y*sumf_m;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
 								    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
 								#pragma unroll
 								    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
 								        int sumi_d = 0;
 								#pragma unroll
 								        for (int j = 0; j < QI8_1; ++j) {
 								            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
 								        }
 								        const float2 ds8f = __half22float2(ds8[i]);
 								        sumf_d += ds8f.x * (sc[i] * sumi_d);
 								        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
 								    }
 								    const float2 dm4f = __half22float2(dm4);
 								    return dm4f.x*sumf_d - dm4f.y*sumf_m;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#define VDR_Q6_K_Q8_1_MMVQ 1
 								#define VDR_Q6_K_Q8_1_MMQ  8
 								// contiguous v/x values
 								static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 								    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
 								    const float & d, const float * __restrict__ d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf = 0.0f;
 								#pragma unroll
 								    for (int i = 0; i < QR6_K; ++i) {
 								        const int sc = scales[4*i];
 								        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
 								        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
 								        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
 								        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
 								    }
 								    return d*sumf;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
 								// contiguous u/y values
 								static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
 								    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
 								    const float & d6, const float * __restrict__ d8) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    float sumf_d = 0.0f;
 								#pragma unroll
 								    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
 								        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
 								#pragma unroll
 								        for (int i = i0; i < i0 + 2; ++i) {
 								            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
 								            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
 								            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
 								            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
 								        }
 								        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
 								    }
 								    return d6 * sumf_d;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int v[VDR_Q4_0_Q8_1_MMVQ];
 								    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
 								        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
 								    }
 								    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_qs;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    *x_dm = (half2 *) tile_x_d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int kbx  = k / QI4_0;
 								    const int kqsx = k % QI4_0;
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    float * x_dmf = (float *) x_dm;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
 								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
 								        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const float * x_dmf = (const float *) x_dm;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int u[2*VDR_Q4_0_Q8_1_MMQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
 								        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int v[VDR_Q4_1_Q8_1_MMVQ];
 								    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
 								        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
 								    }
 								    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE) +     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_qs;
 								    *x_dm = tile_x_dm;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kbx  = k / QI4_1;
 								    const int kqsx = k % QI4_1;
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
 								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
 								    const int kbxd = k % blocks_per_tile_x_row;
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int u[2*VDR_Q4_1_Q8_1_MMQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
 								        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int vl[VDR_Q5_0_Q8_1_MMVQ];
 								    int vh[VDR_Q5_0_Q8_1_MMVQ];
 								    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
 								        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
 								        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
 								    }
 								    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int  tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_ql;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    *x_dm = (half2 *) tile_x_d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kbx  = k / QI5_0;
 								    const int kqsx = k % QI5_0;
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								        const int ql = get_int_from_uint8(bxi->qs, kqsx);
 								        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
 								        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
 								        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
 								        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
 								        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
 								        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
 								        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
 								        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
 								        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
 								        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
 								        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
 								        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
 								        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    float * x_dmf = (float *) x_dm;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int u[2*VDR_Q5_0_Q8_1_MMQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int vl[VDR_Q5_1_Q8_1_MMVQ];
 								    int vh[VDR_Q5_1_Q8_1_MMVQ];
 								    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
 								#pragma unroll
 								    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
 								        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
 								        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
 								        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
 								    }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset < nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kbx  = k / QI5_1;
 								    const int kqsx = k % QI5_1;
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
 								        int qs0 = (ql >>  0) & 0x0F0F0F0F;
 								        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
 								        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
 								        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
 								        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
 								        int qs1 = (ql >>  4) & 0x0F0F0F0F;
 								        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
 								        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
 								        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
 								        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
 								        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
 								    const int kbxd = k % blocks_per_tile_x_row;
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
 								        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int u[2*VDR_Q5_1_Q8_1_MMQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								#pragma unroll
 								    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
 								        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    int v[VDR_Q8_0_Q8_1_MMVQ];
 								    int u[VDR_Q8_0_Q8_1_MMVQ];
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
 								        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
 								        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 								    }
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_qs;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    *x_dm = (half2 *) tile_x_d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kbx  = k / QI8_0;
 								    const int kqsx = k % QI8_0;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    float * x_dmf = (float *) x_dm;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
 								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
 								    }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								#pragma unroll
 								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
 								        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh; (void)x_sc;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
 								    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
 								        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
 								    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
 								    const int bq8_offset = QR2_K * (iqs / QI8_1);
 								    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const uint8_t * scales = bq2_K->scales + scale_offset;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    int    u[QR2_K];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    float d8[QR2_K];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    for (int i = 0; i < QR2_K; ++ i) {
 								        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 13:27:20 +01:00
+								        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
 								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kbx  = k / QI2_K;
 								    const int kqsx = k % QI2_K;
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q2_K * bx0 = (const block_q2_K *) vx;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
 								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
 								    const int kbxd = k % blocks_per_tile_x_row;
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
 								        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
 								        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
 								    }
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
 								        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
 								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int kbx = k / QI2_K;
 								    const int ky  = (k % QI2_K) * QR2_K;
 								    const float * y_df = (const float *) y_ds;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
 								    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
 								    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
 								        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
 								    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
 								    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const float d = bq3_K->d;
 								    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
 								    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
 								    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    int    u[QR3_K];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    float d8[QR3_K];
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    for (int i = 0; i < QR3_K; ++i) {
 								        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 13:27:20 +01:00
+								        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
 								    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2)     + mmq_y/2];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_qh = tile_x_qh;
 								    *x_sc = tile_x_sc;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int kbx  = k / QI3_K;
 								    const int kqsx = k % QI3_K;
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q3_K * bx0 = (const block_q3_K *) vx;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
 								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
 								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
 								    const int kbxd = k % blocks_per_tile_x_row;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    float * x_dmf = (float *) x_dm;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
 								        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
 								        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        const int ksc = k % (QI3_K/4);
 								        const int ksc_low = ksc % (QI3_K/8);
 								        const int shift_low = 4 * (ksc / (QI3_K/8));
 								        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
 								        const int ksc_high = QI3_K/8;
 								        const int shift_high = 2 * ksc;
 								        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
 								        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
 								        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 								    const int kbx  = k / QI3_K;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int ky  = (k % QI3_K) * QR3_K;
 								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
 								    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
 								        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
 								        const int shift = 2 * ((ky % 32) / 8);
 								        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
 								        const int vlh = (vh << 2) & 0x04040404;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        v[l] = __vsubss4(vll, vlh);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
+								#ifndef GGML_QKK_64
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 								    int    v[2];
 								    int    u[2*QR4_K];
 								    float d8[QR4_K];
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
 								    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 23:19:47 +02:00
+								    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
 								    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
 								    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
 								    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    v[0] = q4[0];
 								    v[1] = q4[4];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 07:49:20 +02:00
+								    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
 								    uint16_t aux[2];
 								    const int j = bq8_offset/2;
 								    if (j < 2) {
 								        aux[0] = scales[j+0] & 0x3f3f;
 								        aux[1] = scales[j+2] & 0x3f3f;
 								    } else {
 								        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
 								        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
 								    }
 								    const uint8_t * sc = (const uint8_t *)aux;
 								    const uint8_t * m  = sc + 2;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												Speed up Q4_K (#2322)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 07:49:20 +02:00
+								    for (int i = 0; i < QR4_K; ++i) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 13:27:20 +01:00
+								        d8[i] = __low2float(bq8i->ds);
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 23:19:47 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								        u[2*i+0] = q8[0];
 								        u[2*i+1] = q8[4];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								    }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
 								#else
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 								    float sumf_d = 0.0f;
 								    float sumf_m = 0.0f;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
+								    uint16_t aux16[2];
 								    const uint8_t * s = (const uint8_t *)aux16;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const uint16_t * a = (const uint16_t *)bq4_K->scales;
 								    aux16[0] = a[0] & 0x0f0f;
 								    aux16[1] = (a[0] >> 4) & 0x0f0f;
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								    const float dall = bq4_K->dm[0];
 								    const float dmin = bq4_K->dm[1];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    const float d8_1 = __low2float(bq8_1[0].ds);
 								    const float d8_2 = __low2float(bq8_1[1].ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
 								    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
 								    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
 								    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int v1 = q4[0];
 								    const int v2 = q4[4];
 								    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
 								    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
 								    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
 								    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
 								    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
 								    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
 								    return dall * sumf_d - dmin * sumf_m;
 								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 								#endif
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
 								    const int kqsx = k % QI4_K; // == k if QK_K == 256
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q4_K * bx0 = (const block_q4_K *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
 								        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
 								        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								#if QK_K == 256
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								#else
 								        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
 								#endif
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
 								        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								        const int * scales = (const int *) bxi->scales;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
 								        const int ksc = k % (WARP_SIZE/8);
 								        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
 								        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
 								        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
 								        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
 								                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifndef GGML_QKK_64
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    int   vl[2];
 								    int   vh[2];
 								    int    u[2*QR5_K];
 								    float d8[QR5_K];
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
 								    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
 								    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    vl[0] = ql[0];
 								    vl[1] = ql[4];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    vh[0] = qh[0] >> bq8_offset;
 								    vh[1] = qh[4] >> bq8_offset;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 23:19:47 +02:00
+								    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
 								    uint16_t aux[2];
 								    const int j = bq8_offset/2;
 								    if (j < 2) {
 								        aux[0] = scales[j+0] & 0x3f3f;
 								        aux[1] = scales[j+2] & 0x3f3f;
 								    } else {
 								        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
 								        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
 								    }
 								    const uint8_t * sc = (const uint8_t *)aux;
 								    const uint8_t * m  = sc + 2;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												Some more Q4_K and Q5_K speedup on CUDA (#2346)

* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Spped up Q4_K on CUDA

GTX1660: 29.5 ms/t -> 25.6 ms/t
RTX4080: 8.40 ms/t -> 8.25 ms/t

* Spped up Q4_K on CUDA

GTX1660: 36.7 ms/t -> 35.6 ms/t
RTX4080:  9.8 ms/t ->  9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-23 23:19:47 +02:00
+								    for (int i = 0; i < QR5_K; ++i) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								        d8[i] = __low2float(bq8i->ds);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								        u[2*i+0] = q8[0];
 								        u[2*i+1] = q8[4];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								    }
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
 								#else
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
+								    const int8_t * s = bq5_K->scales;
 								    const float d = bq5_K->d;
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								    const float d8_1 = __low2half(bq8_1[0].ds);
 								    const float d8_2 = __low2half(bq8_1[1].ds);
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
 								    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
 								    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
 								    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
+								    const int vl1 = ql[0];
 								    const int vl2 = ql[4];
-												CUDA: faster non k-quant mul_mat_q kernels (#2483)


											
										
										
											2023-08-02 18:04:04 +02:00
+								    const int step = 4 * (iqs/2); // 0, 4, 8, 12
 								    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
+								    const int in = step%8; // 0, 4, 0, 4
 								    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
 								    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
 								    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
 								    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
 								    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
 								    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
 								                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
 								    return d * sumf_d;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#else
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-												Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)

* Fix Q4_K and Q5_K for QK_K = 64

* Very slightly better Q5_K bit fiddling

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-25 12:48:04 +02:00
+								#endif
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
 								    const int kqsx = k % QI5_K; // == k if QK_K == 256
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q5_K * bx0 = (const block_q5_K *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        const int ky = QR5_K*kqsx;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
 								        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
 								        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
 								        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
 								        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
 								        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
 								        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
 								        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
 								        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
 								        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
 								        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								#if QK_K == 256
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								#endif
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
 								        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								        const int * scales = (const int *) bxi->scales;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
 								        const int ksc = k % (WARP_SIZE/8);
 								        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
 								        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
 								        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
 								        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
 								    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
 								                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
 								    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
 								    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
 								    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
 								    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
 								    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int8_t * scales = bq6_K->scales + scale_offset;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    int    u[QR6_K];
 								    float d8[QR6_K];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								#pragma unroll
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
+								    for (int i = 0; i < QR6_K; ++i) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 13:27:20 +01:00
+								        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    }
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
 								    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
 								    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-												CUDA: mul_mat_vec_q kernels for k-quants (#2203)


											
										
										
											2023-07-14 19:44:08 +02:00
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    *x_ql = tile_x_ql;
 								    *x_dm = tile_x_dm;
 								    *x_sc = tile_x_sc;
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								    GGML_CUDA_ASSUME(i_offset >= 0);
 								    GGML_CUDA_ASSUME(i_offset <  nwarps);
 								    GGML_CUDA_ASSUME(k >= 0);
 								    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
 								    const int kqsx = k % QI6_K; // == k if QK_K == 256
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    const block_q6_K * bx0 = (const block_q6_K *) vx;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								        int i = i0 + i_offset;
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        const int ky = QR6_K*kqsx;
 								        const int ql = get_int_from_uint8(bxi->ql, kqsx);
 								        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
 								        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
 								        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
 								        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
 								        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
 								        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
 								        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
 								        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
 								    float * x_dmf = (float *) x_dm;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
 								        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								    }
 								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
 								        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
 								        if (need_check) {
 								            i = min(i, i_max);
 								        }
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
 								        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
 								        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
 								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
 								    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
 								    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    (void)x_qh;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const float * x_dmf = (const float *) x_dm;
 								    const float * y_df  = (const float *) y_ds;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
 								    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if QK_K == 256
 								    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 								#if QR2_XXS == 8
 								    const int ib32 = iqs;
 								    const uint16_t * q2 = bq2->qs + 4*ib32;
 								    const uint8_t  * aux8 = (const uint8_t *)q2;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    uint32_t aux32 = q2[2] | (q2[3] << 16);
 								    int sumi = 0;
 								    for (int l = 0; l < 4; ++l) {
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
 								        for (int j = 0; j < 8; ++j) {
 								            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
 								        }
 								        q8 += 8;
 								        aux32 >>= 7;
 								    }
-												cuda : fix 2-bit quants on amd hip (#5105)

* cuda : fix 2-bit quants on amd hip

* use __low2float intrinsic function for new quants
											
										
										
											2024-01-24 23:18:15 +01:00
+								    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								    return d * sumi;
 								#else
 								    // iqs is 0...15
 								    const int ib32 = iqs/2;
 								    const int il = iqs%2;
 								    const uint16_t * q2 = bq2->qs + 4*ib32;
 								    const uint8_t  * aux8 = (const uint8_t *)q2;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
 								    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-												cuda : fix 2-bit quants on amd hip (#5105)

* cuda : fix 2-bit quants on amd hip

* use __low2float intrinsic function for new quants
											
										
										
											2024-01-24 23:18:15 +01:00
+								    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
 								    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
 								    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int j = 0; j < 8; ++j) {
 								        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
 								        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
 								    }
 								    return d * (sumi1 + sumi2);
 								#endif
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								#if QK_K == 256
 								    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 								    const int ib32 = iqs;
 								    const uint16_t * q2 = bq2->qs + 4*ib32;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
 								    const uint8_t ls2 = bq2->scales[ib32] >>  4;
 								    int sumi1 = 0;
 								    for (int l = 0; l < 2; ++l) {
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
 								        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
 								        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
 								        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
 								        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
 								        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        q8 += 8;
 								    }
 								    int sumi2 = 0;
 								    for (int l = 2; l < 4; ++l) {
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
 								        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
 								        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
 								        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
 								        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
 								        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        q8 += 8;
 								    }
-												cuda : fix 2-bit quants on amd hip (#5105)

* cuda : fix 2-bit quants on amd hip

* use __low2float intrinsic function for new quants
											
										
										
											2024-01-24 23:18:15 +01:00
+								    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 								#else
-												ci : enable -Werror for CUDA builds (#5579)

* cmake : pass -Werror through -Xcompiler

ggml-ci

* make, cmake : enable CUDA errors on warnings

ggml-ci
											
										
										
											2024-02-19 13:45:41 +01:00
+								    (void) ksigns64;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								    assert(false);
 								    return 0.f;
 								#endif
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								#else
-												ci : enable -Werror for CUDA builds (#5579)

* cmake : pass -Werror through -Xcompiler

ggml-ci

* make, cmake : enable CUDA errors on warnings

ggml-ci
											
										
										
											2024-02-19 13:45:41 +01:00
+								    (void) ksigns64;
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								    assert(false);
 								    return 0.f;
 								#endif
 								}
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								// TODO
 								static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								#if QK_K == 256
 								    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 								    const int ib32 = iqs;
 								    const int8_t  * q8 = bq8_1[ib32].qs;
 								    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
 								    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
 								    const uint8_t ls2 = bq2->scales[ib32] >>  4;
 								    int sumi1 = 0;
 								    for (int l = 0; l < 2; ++l) {
 								        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
 								        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
 								        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
 								        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
 								        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
 								        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
 								        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
 								        q8 += 8;
 								    }
 								    int sumi2 = 0;
 								    for (int l = 2; l < 4; ++l) {
 								        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
 								        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
 								        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
 								        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
 								        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
 								        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
 								        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
 								        q8 += 8;
 								    }
 								    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
 								    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 								#else
 								    (void) ksigns64;
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    (void) ksigns64;
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								#if QK_K == 256
 								    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 								    const int ib32 = iqs;
 								    const uint8_t  * q3 = bq2->qs + 8*ib32;
 								    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    uint32_t aux32 = gas[0] | (gas[1] << 16);
 								    int sumi = 0;
 								    for (int l = 0; l < 4; ++l) {
 								        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
 								        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
 								        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
 								        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
 								        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
 								        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
 								        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
 								        q8 += 8;
 								        aux32 >>= 7;
 								    }
 								    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
 								    return d * sumi;
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								}
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								// TODO: don't use lookup table for signs
 								static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								#if QK_K == 256
 								    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 								    const int ib32 = iqs;
 								    const uint8_t  * qs = bq2->qs + 8*ib32;
 								    const int8_t   * q8 = bq8_1[ib32].qs;
 								    int sumi = 0;
 								    for (int l = 0; l < 4; ++l) {
-												ggml : IQ3_S improvements (#5829)

* iq3_s: somewhat faster AVX2 dot product

On Ryzen a 7950X TG-128 increases to 16 t/s from 15.5 t/s using
16 threads. For 8 threads it is 13.85 t/s vs 11.75 t/s.
PP-512 increases to 28.5 t/s from 23.8 t/s.

* iq3_s: somewhat faster ARM_NEON dot product

Still dog slow - 10.7 t/s up from 9.9 t/s.

* iq3_s: another small ARM_NEON improvement

10.7 -> 11.0 t/s. Using vmulq_s8 is faster than the xor - sub trick
that works best on AVX2.

* iq3_s: minor improvement on Metal

49.4 t/s -> 50.3 t/s

* iq3_s: PPL improvement

E.g., for a context of 4096 LLaMA-v2-7B goes to 5.1340 from 5.1653.

* iq3_s: use new grid everywhere

* Fix ARM_NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-02 16:00:51 +01:00
+								        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
 								        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
 								        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
 								        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
 								        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
 								        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
 								        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
 								        q8 += 8;
 								    }
-												ggml : IQ3_S improvements (#5829)

* iq3_s: somewhat faster AVX2 dot product

On Ryzen a 7950X TG-128 increases to 16 t/s from 15.5 t/s using
16 threads. For 8 threads it is 13.85 t/s vs 11.75 t/s.
PP-512 increases to 28.5 t/s from 23.8 t/s.

* iq3_s: somewhat faster ARM_NEON dot product

Still dog slow - 10.7 t/s up from 9.9 t/s.

* iq3_s: another small ARM_NEON improvement

10.7 -> 11.0 t/s. Using vmulq_s8 is faster than the xor - sub trick
that works best on AVX2.

* iq3_s: minor improvement on Metal

49.4 t/s -> 50.3 t/s

* iq3_s: PPL improvement

E.g., for a context of 4096 LLaMA-v2-7B goes to 5.1340 from 5.1653.

* iq3_s: use new grid everywhere

* Fix ARM_NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-02 16:00:51 +01:00
+								    const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								    return d * sumi;
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if QK_K == 256
 								    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 								    const int ib32 = iqs;
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 07:51:49 +01:00
+								    int sumi = 0;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const int * q8 = (const int *)bq8_1[ib32].qs;
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 07:51:49 +01:00
+								    for (int l = 0; l < 4; ++l) {
 								        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
-.5 bit: we can do even better (#5999)

* iq1_s: we can do even better

Spent one of the 4 scale bits on a signs of a 0.125 shift.
I.e., quants are now -1 + delta, delta, 1 + delta, where delta
is +/- 0.125.

CUDA works, same performance as before.
PPL(LLaMA-v2-7B) is now 11.85!

* iq1_s: make scalar and AVX2 work with the new version

* iq1_s: make Neon work with new version.

~10% drop in performance, so will need some more work.

* iq1_s: make Metal work with new version

* iq1_s: very slightly faster dequantize on Metal

* iq1_s: fix dequantize on the CPU

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 16:53:15 +01:00
+								        int grid0 = grid[0] & 0x0f0f0f0f;
 								        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 07:51:49 +01:00
+								        sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								    }
 								#else
-.5 bit: we can do even better (#5999)

* iq1_s: we can do even better

Spent one of the 4 scale bits on a signs of a 0.125 shift.
I.e., quants are now -1 + delta, delta, 1 + delta, where delta
is +/- 0.125.

CUDA works, same performance as before.
PPL(LLaMA-v2-7B) is now 11.85!

* iq1_s: make scalar and AVX2 work with the new version

* iq1_s: make Neon work with new version.

~10% drop in performance, so will need some more work.

* iq1_s: make Metal work with new version

* iq1_s: very slightly faster dequantize on Metal

* iq1_s: fix dequantize on the CPU

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 16:53:15 +01:00
+								    const int8_t * q8 = bq8_1[ib32].qs;
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 07:51:49 +01:00
+								    for (int l = 0; l < 4; ++l) {
 								        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
 								        for (int j = 0; j < 4; ++j) {
-.5 bit: we can do even better (#5999)

* iq1_s: we can do even better

Spent one of the 4 scale bits on a signs of a 0.125 shift.
I.e., quants are now -1 + delta, delta, 1 + delta, where delta
is +/- 0.125.

CUDA works, same performance as before.
PPL(LLaMA-v2-7B) is now 11.85!

* iq1_s: make scalar and AVX2 work with the new version

* iq1_s: make Neon work with new version.

~10% drop in performance, so will need some more work.

* iq1_s: make Metal work with new version

* iq1_s: very slightly faster dequantize on Metal

* iq1_s: fix dequantize on the CPU

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 16:53:15 +01:00
+								            sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
-												Better 1.5 bit quantization  (#5971)

* Trying blocvks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks alltogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 07:51:49 +01:00
+								        }
 								        q8 += 8;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								    }
 								#endif
-.5 bit: we can do even better (#5999)

* iq1_s: we can do even better

Spent one of the 4 scale bits on a signs of a 0.125 shift.
I.e., quants are now -1 + delta, delta, 1 + delta, where delta
is +/- 0.125.

CUDA works, same performance as before.
PPL(LLaMA-v2-7B) is now 11.85!

* iq1_s: make scalar and AVX2 work with the new version

* iq1_s: make Neon work with new version.

~10% drop in performance, so will need some more work.

* iq1_s: make Metal work with new version

* iq1_s: very slightly faster dequantize on Metal

* iq1_s: fix dequantize on the CPU

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-03-11 16:53:15 +01:00
+								    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
 								    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
 								    const float d = d1q * __low2float (bq8_1[ib32].ds);
 								    const float m = d1q * __high2float(bq8_1[ib32].ds);
 								    return d * sumi + m * delta;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								}
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
 								        int & val1, int & val2) {
 								    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
 								    aux32 = q4 & 0x0f0f0f0f;
 								    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
 								    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
 								    val1 = v1 | (v2 << 16);
 								    aux32 = (q4 >> 4) & 0x0f0f0f0f;
 								    v1 = values[q8[0]] | (values[q8[1]] << 8);
 								    v2 = values[q8[2]] | (values[q8[3]] << 8);
 								    val2 = v1 | (v2 << 16);
 								}
 								#endif
 								static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
 								    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
 								    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 								    int v1, v2;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
 								        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
 								        get_int_from_table_16(aux, values, v1, v2);
 								        sumi1 = __dp4a(v1, q8[l+0], sumi1);
 								        sumi2 = __dp4a(v2, q8[l+4], sumi2);
 								    }
 								#else
 								    const uint8_t * q4 = bq->qs + 4*iqs;
 								    const int8_t  * q8 = bq8_1->qs + 4*iqs;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
 								        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
 								        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
 								    }
 								#endif
 								    const float d = (float)bq->d * __low2float(bq8_1->ds);
 								    return d * (sumi1 + sumi2);
 								}
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
 								    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 								#if QK_K == 256
 								#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 								    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
 								    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 								    //// iqs is 0...7
 								    //const int ib64 = iqs/2;
 								    //const int il = iqs%2;
 								    //const int32_t  * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
 								    //const int32_t  * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
 								    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
 								    //const uint32_t * q4_2 = q4_1 + 4;
 								    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
 								    //const int8_t ls2 = (bq4->scales_l[ib64] >>  4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
 								    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
 								    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
 								    //int v1, v2;
 								    //int sumi1 = 0, sumi2 = 0;
 								    //for (int j = 0; j < 2; ++j) {
 								    //    get_int_from_table_16(q4_1[j], values, v1, v2);
 								    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
 								    //    get_int_from_table_16(q4_2[j], values, v1, v2);
 								    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
 								    //}
 								    //return d1 * sumi1 + d2 * sumi2;
 								    // iqs is 0...7
 								    const int ib32 = iqs;
 								    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
 								    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
 								    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
 								    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
 								    int v1, v2;
 								    int sumi1 = 0, sumi2 = 0;
 								    for (int j = 0; j < 4; ++j) {
 								        get_int_from_table_16(q4[j], values, v1, v2);
 								        sumi1 = __dp4a(v1, q8[j+0], sumi1);
 								        sumi2 = __dp4a(v2, q8[j+4], sumi2);
 								    }
 								    return d * (sumi1 + sumi2);
 								    //// iqs is 0...15
 								    //const int ib32 = iqs/2;
 								    //const int il = iqs%2;
 								    //const int32_t  * q8 = (const int *)bq8_1[ib32].qs + 2*il;
 								    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
 								    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
 								    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
 								    //int v1, v2;
 								    //int sumi1 = 0, sumi2 = 0;
 								    //for (int j = 0; j < 2; ++j) {
 								    //    get_int_from_table_16(q4[j], values, v1, v2);
 								    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
 								    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
 								    //}
 								    //return d * (sumi1 + sumi2);
 								#else
 								    assert(false);
 								    return 0.f;
 								#endif
 								#else
-												ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760)

* WIP: make i-quants work for QK_K = 64

* iq2_xs: attempt to fix AVX dot product for QK_K = 64

Tests pass, but I get gibberish.

* QK_K = 64 tests pass on ARM_NEON and Metal

Sadly, that does not mean it actually works.

* Make CUDA compile with QK_K = 64

Tests don't pass, plus we get misaligned access

* Q2_K: fixed bug in imatrix quantization for QK_K = 64

* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-28 09:37:02 +01:00
+								    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								#endif
 								}
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								static __device__ __forceinline__ void mul_mat_q(
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 								    const block_q_t  * x = (const block_q_t  *) vx;
 								    const block_q8_1 * y = (const block_q8_1 *) vy;
 								    const int blocks_per_row_x = ncols_x / qk;
 								    const int blocks_per_col_y = nrows_y / QK8_1;
 								    const int blocks_per_warp = WARP_SIZE / qi;
 								    const int & ncols_dst = ncols_y;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int row_dst_0 = blockIdx.x*mmq_y;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int & row_x_0 = row_dst_0;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    const int col_dst_0 = blockIdx.y*mmq_x;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    const int & col_y_0 = col_dst_0;
 								    int   * tile_x_ql = nullptr;
 								    half2 * tile_x_dm = nullptr;
 								    int   * tile_x_qh = nullptr;
 								    int   * tile_x_sc = nullptr;
 								    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE];
 								    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												Clean up ggml-cuda.cu warnings when compiling with clang (for ROCM) (#4124)

* ggml-cuda.cu: Clean up warnings when compiling with clang

* ggml-cuda.cu: Move static items into anonymous namespace

* ggml-cuda.cu: Fix use of namespace start macro

* Revert "ggml-cuda.cu: Fix use of namespace start macro"

This reverts commit 26c11490266c096e3e5731e05270a8f73a5b2874.

* Revert "ggml-cuda.cu: Move static items into anonymous namespace"

This reverts commit e29757e0f7535d1ac314300f0324684cc785e06c.
											
										
										
											2023-11-18 16:11:18 +01:00
+								    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								#pragma unroll
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								        for (int ir = 0; ir < qr; ++ir) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								            const int kqs = ir*WARP_SIZE + threadIdx.x;
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								            const int kbxd = kqs / QI8_1;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								#pragma unroll
 								            for (int i = 0; i < mmq_x; i += nwarps) {
 								                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: fewer memory bank conflicts for mul_mat_q (#2458)


											
										
										
											2023-07-31 13:18:51 +02:00
+								                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
 								                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								#pragma unroll
 								            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
 								                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
 								                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
 								                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
 								                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
 								                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
 								                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
 								                if (need_sum) {
 								                    *dsi_dst = *dsi_src;
 								                } else {
 								                    float * dfi_dst = (float *) dsi_dst;
-												ggml : reuse quantum structs across backends (#5943)

* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
											
										
										
											2024-03-12 13:27:20 +01:00
+								                    *dfi_dst = __low2float(*dsi_src);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								                }
-												CUDA: faster k-quant mul_mat_q kernels (#2525)


											
										
										
											2023-08-05 18:20:44 +02:00
+								            }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								            __syncthreads();
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								// #pragma unroll // unrolling this loop causes too much register pressure
 								            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								                for (int j = 0; j < mmq_x; j += nwarps) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
 								                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
 								                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
 								                            threadIdx.x + i, threadIdx.y + j, k);
 								                    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								                }
 								            }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								            __syncthreads();
 								        }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								    }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    for (int j = 0; j < mmq_x; j += nwarps) {
 								        const int col_dst = col_dst_0 + j + threadIdx.y;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								        if (col_dst >= ncols_dst) {
 								            return;
 								        }
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#pragma unroll
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								            const int row_dst = row_dst_0 + threadIdx.x + i;
 								            if (row_dst >= nrows_dst) {
 								                continue;
 								            }
 								            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								        }
 								    }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q4_0_RDNA2  64
 								#define  MMQ_Y_Q4_0_RDNA2  128
 								#define NWARPS_Q4_0_RDNA2  8
 								#define  MMQ_X_Q4_0_RDNA1  64
 								#define  MMQ_Y_Q4_0_RDNA1  64
 								#define NWARPS_Q4_0_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q4_0_AMPERE 4
 								#define  MMQ_Y_Q4_0_AMPERE 32
 								#define NWARPS_Q4_0_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q4_0_AMPERE 64
 								#define  MMQ_Y_Q4_0_AMPERE 128
 								#define NWARPS_Q4_0_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q4_0_PASCAL 64
 								#define  MMQ_Y_Q4_0_PASCAL 64
 								#define NWARPS_Q4_0_PASCAL 8
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    mul_mat_q4_0(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
 								    const int nwarps = NWARPS_Q4_0_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
 								    const int nwarps = NWARPS_Q4_0_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
 								        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
 								    const int nwarps = NWARPS_Q4_0_AMPERE;
 								    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
 								        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
 								    const int nwarps = NWARPS_Q4_0_PASCAL;
 								    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
 								        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q4_0_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q4_1_RDNA2  64
 								#define  MMQ_Y_Q4_1_RDNA2  128
 								#define NWARPS_Q4_1_RDNA2  8
 								#define  MMQ_X_Q4_1_RDNA1  64
 								#define  MMQ_Y_Q4_1_RDNA1  64
 								#define NWARPS_Q4_1_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q4_1_AMPERE 4
 								#define  MMQ_Y_Q4_1_AMPERE 32
 								#define NWARPS_Q4_1_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q4_1_AMPERE 64
 								#define  MMQ_Y_Q4_1_AMPERE 128
 								#define NWARPS_Q4_1_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q4_1_PASCAL 64
 								#define  MMQ_Y_Q4_1_PASCAL 64
 								#define NWARPS_Q4_1_PASCAL 8
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								template <bool need_check> static __global__ void
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    mul_mat_q4_1(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
 								    const int nwarps = NWARPS_Q4_1_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
 								    const int nwarps = NWARPS_Q4_1_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
 								        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
 								    const int nwarps = NWARPS_Q4_1_AMPERE;
 								    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
 								        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
 								    const int nwarps = NWARPS_Q4_1_PASCAL;
 								    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
 								        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q4_1_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q5_0_RDNA2  64
 								#define  MMQ_Y_Q5_0_RDNA2  128
 								#define NWARPS_Q5_0_RDNA2  8
 								#define  MMQ_X_Q5_0_RDNA1  64
 								#define  MMQ_Y_Q5_0_RDNA1  64
 								#define NWARPS_Q5_0_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q5_0_AMPERE 4
 								#define  MMQ_Y_Q5_0_AMPERE 32
 								#define NWARPS_Q5_0_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q5_0_AMPERE 128
 								#define  MMQ_Y_Q5_0_AMPERE 64
 								#define NWARPS_Q5_0_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q5_0_PASCAL 64
 								#define  MMQ_Y_Q5_0_PASCAL 64
 								#define NWARPS_Q5_0_PASCAL 8
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    mul_mat_q5_0(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
 								    const int nwarps = NWARPS_Q5_0_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
 								    const int nwarps = NWARPS_Q5_0_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
 								        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
 								    const int nwarps = NWARPS_Q5_0_AMPERE;
 								    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
 								        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
 								    const int nwarps = NWARPS_Q5_0_PASCAL;
 								    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
 								        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q5_0_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q5_1_RDNA2  64
 								#define  MMQ_Y_Q5_1_RDNA2  128
 								#define NWARPS_Q5_1_RDNA2  8
 								#define  MMQ_X_Q5_1_RDNA1  64
 								#define  MMQ_Y_Q5_1_RDNA1  64
 								#define NWARPS_Q5_1_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q5_1_AMPERE 4
 								#define  MMQ_Y_Q5_1_AMPERE 32
 								#define NWARPS_Q5_1_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q5_1_AMPERE 128
 								#define  MMQ_Y_Q5_1_AMPERE 64
 								#define NWARPS_Q5_1_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q5_1_PASCAL 64
 								#define  MMQ_Y_Q5_1_PASCAL 64
 								#define NWARPS_Q5_1_PASCAL 8
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								mul_mat_q5_1(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
 								    const int nwarps = NWARPS_Q5_1_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
 								    const int nwarps = NWARPS_Q5_1_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
 								        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
 								    const int nwarps = NWARPS_Q5_1_AMPERE;
 								    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
 								        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
 								    const int nwarps = NWARPS_Q5_1_PASCAL;
 								    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
 								        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q5_1_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q8_0_RDNA2  64
 								#define  MMQ_Y_Q8_0_RDNA2  128
 								#define NWARPS_Q8_0_RDNA2  8
 								#define  MMQ_X_Q8_0_RDNA1  64
 								#define  MMQ_Y_Q8_0_RDNA1  64
 								#define NWARPS_Q8_0_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q8_0_AMPERE 4
 								#define  MMQ_Y_Q8_0_AMPERE 32
 								#define NWARPS_Q8_0_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q8_0_AMPERE 128
 								#define  MMQ_Y_Q8_0_AMPERE 64
 								#define NWARPS_Q8_0_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q8_0_PASCAL 64
 								#define  MMQ_Y_Q8_0_PASCAL 64
 								#define NWARPS_Q8_0_PASCAL 8
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    mul_mat_q8_0(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
 								    const int nwarps = NWARPS_Q8_0_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
 								    const int nwarps = NWARPS_Q8_0_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
 								        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
 								    const int nwarps = NWARPS_Q8_0_AMPERE;
 								    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
 								        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
 								    const int nwarps = NWARPS_Q8_0_PASCAL;
 								    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
 								        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q8_0_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q2_K_RDNA2  64
 								#define  MMQ_Y_Q2_K_RDNA2  128
 								#define NWARPS_Q2_K_RDNA2  8
 								#define  MMQ_X_Q2_K_RDNA1  128
 								#define  MMQ_Y_Q2_K_RDNA1  32
 								#define NWARPS_Q2_K_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q2_K_AMPERE 4
 								#define  MMQ_Y_Q2_K_AMPERE 32
 								#define NWARPS_Q2_K_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q2_K_AMPERE 64
 								#define  MMQ_Y_Q2_K_AMPERE 128
 								#define NWARPS_Q2_K_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q2_K_PASCAL 64
 								#define  MMQ_Y_Q2_K_PASCAL 64
 								#define NWARPS_Q2_K_PASCAL 8
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								mul_mat_q2_K(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
 								    const int nwarps = NWARPS_Q2_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
 								    const int nwarps = NWARPS_Q2_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
 								        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
 								    const int nwarps = NWARPS_Q2_K_AMPERE;
 								    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
 								        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
 								    const int nwarps = NWARPS_Q2_K_PASCAL;
 								    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
 								        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q2_K_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q3_K_RDNA2  128
 								#define  MMQ_Y_Q3_K_RDNA2  64
 								#define NWARPS_Q3_K_RDNA2  8
 								#define  MMQ_X_Q3_K_RDNA1  32
 								#define  MMQ_Y_Q3_K_RDNA1  128
 								#define NWARPS_Q3_K_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q3_K_AMPERE 4
 								#define  MMQ_Y_Q3_K_AMPERE 32
 								#define NWARPS_Q3_K_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q3_K_AMPERE 128
 								#define  MMQ_Y_Q3_K_AMPERE 128
 								#define NWARPS_Q3_K_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q3_K_PASCAL 64
 								#define  MMQ_Y_Q3_K_PASCAL 64
 								#define NWARPS_Q3_K_PASCAL 8
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								template <bool need_check> static __global__ void
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    mul_mat_q3_K(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
 								    const int nwarps = NWARPS_Q3_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
 								    const int nwarps = NWARPS_Q3_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
 								        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
 								    const int nwarps = NWARPS_Q3_K_AMPERE;
 								    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
 								        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
 								    const int nwarps = NWARPS_Q3_K_PASCAL;
 								    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
 								        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q3_K_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q4_K_RDNA2  64
 								#define  MMQ_Y_Q4_K_RDNA2  128
 								#define NWARPS_Q4_K_RDNA2  8
 								#define  MMQ_X_Q4_K_RDNA1  32
 								#define  MMQ_Y_Q4_K_RDNA1  64
 								#define NWARPS_Q4_K_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q4_K_AMPERE 4
 								#define  MMQ_Y_Q4_K_AMPERE 32
 								#define NWARPS_Q4_K_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q4_K_AMPERE 64
 								#define  MMQ_Y_Q4_K_AMPERE 128
 								#define NWARPS_Q4_K_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								#define  MMQ_X_Q4_K_PASCAL 64
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_Y_Q4_K_PASCAL 64
 								#define NWARPS_Q4_K_PASCAL 8
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								template <bool need_check> static __global__ void
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    mul_mat_q4_K(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
 								    const int nwarps = NWARPS_Q4_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
 								    const int nwarps = NWARPS_Q4_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
 								        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
 								    const int nwarps = NWARPS_Q4_K_AMPERE;
 								    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
 								        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
 								    const int nwarps = NWARPS_Q4_K_PASCAL;
 								    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
 								        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q4_K_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q5_K_RDNA2  64
 								#define  MMQ_Y_Q5_K_RDNA2  128
 								#define NWARPS_Q5_K_RDNA2  8
 								#define  MMQ_X_Q5_K_RDNA1  32
 								#define  MMQ_Y_Q5_K_RDNA1  64
 								#define NWARPS_Q5_K_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q5_K_AMPERE 4
 								#define  MMQ_Y_Q5_K_AMPERE 32
 								#define NWARPS_Q5_K_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q5_K_AMPERE 64
 								#define  MMQ_Y_Q5_K_AMPERE 128
 								#define NWARPS_Q5_K_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q5_K_PASCAL 64
 								#define  MMQ_Y_Q5_K_PASCAL 64
 								#define NWARPS_Q5_K_PASCAL 8
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								template <bool need_check> static __global__ void
 								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								mul_mat_q5_K(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
 								    const int nwarps = NWARPS_Q5_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
 								    const int nwarps = NWARPS_Q5_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
 								        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
 								    const int nwarps = NWARPS_Q5_K_AMPERE;
 								    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
 								        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
 								    const int nwarps = NWARPS_Q5_K_PASCAL;
 								    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
 								        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q5_K_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#define  MMQ_X_Q6_K_RDNA2  64
 								#define  MMQ_Y_Q6_K_RDNA2  128
 								#define NWARPS_Q6_K_RDNA2  8
 								#define  MMQ_X_Q6_K_RDNA1  32
 								#define  MMQ_Y_Q6_K_RDNA1  64
 								#define NWARPS_Q6_K_RDNA1  8
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(CUDA_USE_TENSOR_CORES)
 								#define  MMQ_X_Q6_K_AMPERE 4
 								#define  MMQ_Y_Q6_K_AMPERE 32
 								#define NWARPS_Q6_K_AMPERE 4
 								#else
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_X_Q6_K_AMPERE 64
 								#define  MMQ_Y_Q6_K_AMPERE 64
 								#define NWARPS_Q6_K_AMPERE 4
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#endif
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								#define  MMQ_X_Q6_K_PASCAL 64
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								#define  MMQ_Y_Q6_K_PASCAL 64
 								#define NWARPS_Q6_K_PASCAL 8
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								template <bool need_check> static __global__ void
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 								#endif // defined(RDNA3) || defined(RDNA2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ < CC_VOLTA
-												CUDA: launch_bounds, small q4_K, q5_K mmq refactor (#2596)


											
										
										
											2023-08-14 10:41:22 +02:00
+								    mul_mat_q6_K(
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								#if defined(RDNA3) || defined(RDNA2)
 								    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
 								    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
 								    const int nwarps = NWARPS_Q6_K_RDNA2;
 								#else
 								    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
 								    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
 								    const int nwarps = NWARPS_Q6_K_RDNA1;
 								#endif // defined(RDNA3) || defined(RDNA2)
 								    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
 								        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#elif __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
 								    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
 								    const int nwarps = NWARPS_Q6_K_AMPERE;
 								    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
 								        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#elif __CUDA_ARCH__ >= MIN_CC_DP4A
 								    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
 								    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
 								    const int nwarps = NWARPS_Q6_K_PASCAL;
 								    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
 								        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 								        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								#else
 								    (void) vec_dot_q6_K_q8_1_mul_mat;
-												CUDA: more info when no device code (#5088)


											
										
										
											2024-01-23 13:31:56 +01:00
+								    NO_DEVICE_CODE;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								#endif // __CUDA_ARCH__ >= CC_VOLTA
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								}
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								// tell the compiler to use as many registers as it wants, see nwarps definition below
 								__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								static __global__ void mul_mat_vec_q(
 								    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
 								    constexpr int nwarps              = 1;
 								    constexpr int rows_per_cuda_block = 1;
 								#else
 								    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
 								    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
 								    const     int row0 = rows_per_cuda_block*blockIdx.x;
 								    const     int blocks_per_row_x = ncols_x / qk;
 								    const     int blocks_per_col_y = nrows_y / QK8_1;
 								    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
 								// partial sum for each thread
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
 								    const block_q_t  * x = (const block_q_t  *) vx;
 								    const block_q8_1 * y = (const block_q8_1 *) vy;
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
 								        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								        // x block quant index when casting the quants to int
 								        const int kqs = vdr * (tid % (qi/vdr));
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								#pragma unroll
 								        for (int j = 0; j < ncols_y; ++j) {
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								#pragma unroll
 								            for (int i = 0; i < rows_per_cuda_block; ++i) {
 								                tmp[j][i] += vec_dot_q_cuda(
 								                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
 								            }
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								        }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    }
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								    if (threadIdx.y > 0) {
 								#pragma unroll
 								        for (int j = 0; j < ncols_y; ++j) {
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								#pragma unroll
 								            for (int i = 0; i < rows_per_cuda_block; ++i) {
 								                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
 								            }
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								        }
 								    }
 								    __syncthreads();
 								    if (threadIdx.y > 0) {
 								        return;
 								    }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    // sum up partial sums and write back result
 								#pragma unroll
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								    for (int j = 0; j < ncols_y; ++j) {
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								#pragma unroll
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								        for (int i = 0; i < rows_per_cuda_block; ++i) {
 								#pragma unroll
 								            for (int l = 0; l < nwarps-1; ++l) {
 								                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
 								            }
 								            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								        }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								        if (threadIdx.x < rows_per_cuda_block) {
 								            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								        }
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    }
 								}
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    // qk = quantized weights per x block
 								    // qr = number of quantized weights per data value in x block
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    if (row >= nrows) {
 								        return;
 								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const int tid = threadIdx.x;
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    const int iter_stride = 2*GGML_CUDA_DMMV_X;
 								    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    const int y_offset = qr == 1 ? 1 : qk/2;
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								// partial sum for each thread
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
 								#else
 								    float tmp = 0.0f;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    for (int i = 0; i < ncols; i += iter_stride) {
 								        const int col = i + vals_per_iter*tid;
 								        const int ib = (row*ncols + col)/qk; // x block index
 								        const int iqs = (col%qk)/qr; // x quant index
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								        const int iybs = col - col%qk; // y block start index
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								// processing >2 values per i iter is faster for fast GPUs
 								#pragma unroll
 								        for (int j = 0; j < vals_per_iter; j += 2) {
 								            // process 2 vals per j iter
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								            // dequantize
 								            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								            dfloat2 v;
 								            dequantize_kernel(vx, ib, iqs + j/qr, v);
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
 								            // matrix multiplication
 								            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								            tmp += __hmul2(v, {
 								                y[iybs + iqs + j/qr + 0],
 								                y[iybs + iqs + j/qr + y_offset]
 								            });
 								#else
 								            tmp += v.x * y[iybs + iqs + j/qr + 0];
 								            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								        }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    }
 								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    if (tid == 0) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								        dst[row] = tmp.x + tmp.y;
 								#else
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								        dst[row] = tmp;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								    }
 								}
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								static __global__ void mul_mat_p021_f16_f32(
 								    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
 								    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
-												cuda : fix missing const qualifier in casts (#2027)


											
										
										
											2023-06-28 19:26:26 +02:00
+								    const half * x = (const half *) vx;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    const int channel_x = channel / (nchannels_y / nchannels_x);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int nrows_y = ncols_x;
 								    const int nrows_dst = nrows_x;
 								    const int row_dst = row_x;
 								    float tmp = 0.0f;
 								    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
 								        const int col_x = col_x0 + threadIdx.x;
 								        if (col_x >= ncols_x) {
 								            break;
 								        }
 								        // x is transposed and permuted
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        const float xi = __half2float(x[ix]);
 								        const int row_y = col_x;
 								        // y is not transposed but permuted
 								        const int iy = channel*nrows_y + row_y;
 								        tmp += xi * y[iy];
 								    }
 								    // dst is not transposed and not permuted
 								    const int idst = channel*nrows_dst + row_dst;
 								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    if (threadIdx.x == 0) {
 								        dst[idst] = tmp;
 								    }
 								}
 								static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-												CUDA: add __restrict__ to mul mat vec kernels (#2140)


											
										
										
											2023-07-08 00:25:15 +02:00
+								    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												cuda : fix missing const qualifier in casts (#2027)


											
										
										
											2023-06-28 19:26:26 +02:00
+								    const half * x = (const half *) vx;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    const int channel_x = channel / channel_x_divisor;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    const int nrows_y   = ncols_x;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const int nrows_dst = nrows_x;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    const int row_dst   = row_x;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int idst = channel*nrows_dst + row_dst;
 								    float tmp = 0.0f;
 								    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
 								        const int col_x = col_x0 + threadIdx.x;
 								        if (col_x >= ncols_x) {
 								            break;
 								        }
 								        const int row_y = col_x;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        const int iy = channel*nrows_y + row_y;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								        const float xi = __half2float(x[ix]);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        tmp += xi * y[iy];
 								    }
 								    // sum up partial sums and write back result
-												cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)


											
										
										
											2024-02-27 14:22:45 +01:00
+								    tmp = warp_reduce_sum(tmp);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    if (threadIdx.x == 0) {
 								        dst[idst] = tmp;
 								    }
 								}
 								static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-												cuda : fix missing const qualifier in casts (#2027)


											
										
										
											2023-06-28 19:26:26 +02:00
+								    const float * xi = (const float *) cxi;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    float * dsti = (float *) cdsti;
 								    *dsti = *xi;
 								}
 								static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-												cuda : fix missing const qualifier in casts (#2027)


											
										
										
											2023-06-28 19:26:26 +02:00
+								    const float * xi = (const float *) cxi;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    half * dsti = (half *) cdsti;
 								    *dsti = __float2half(*xi);
 								}
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
 								    const half * xi = (const half *) cxi;
 								    half * dsti = (half *) cdsti;
 								    *dsti = *xi;
 								}
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
 								    const half * xi = (const half *) cxi;
 								    float * dsti = (float *) cdsti;
 								    *dsti = *xi;
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								template <cpy_kernel_t cpy_1>
 								static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
 								                                   const int nb12, const int nb13) {
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    if (i >= ne) {
 								        return;
 								    }
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    // then combine those indices with the corresponding byte offsets to get the total offsets
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    const int64_t i03 = i/(ne00 * ne01 * ne02);
 								    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
 								    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
 								    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
 								    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
 								    const int64_t i13 = i/(ne10 * ne11 * ne12);
 								    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
 								    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
 								    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
 								    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    cpy_1(cx + x_offset, cdst + dst_offset);
 								}
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q8_0 * dsti = (block_q8_0 *) cdsti;
 								    float amax = 0.0f; // absolute max
 								    for (int j = 0; j < QK8_0; j++) {
 								        const float v = xi[j];
 								        amax = fmaxf(amax, fabsf(v));
 								    }
 								    const float d = amax / ((1 << 7) - 1);
 								    const float id = d ? 1.0f/d : 0.0f;
 								    dsti->d = d;
 								    for (int j = 0; j < QK8_0; ++j) {
 								        const float x0 = xi[j]*id;
 								        dsti->qs[j] = roundf(x0);
 								    }
 								}
 								static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q4_0 * dsti = (block_q4_0 *) cdsti;
 								    float amax = 0.0f;
 								    float vmax = 0.0f;
 								    for (int j = 0; j < QK4_0; ++j) {
 								        const float v = xi[j];
 								        if (amax < fabsf(v)) {
 								            amax = fabsf(v);
 								            vmax = v;
 								        }
 								    }
 								    const float d  = vmax / -8;
 								    const float id = d ? 1.0f/d : 0.0f;
 								    dsti->d = d;
 								    for (int j = 0; j < QK4_0/2; ++j) {
 								        const float x0 = xi[0       + j]*id;
 								        const float x1 = xi[QK4_0/2 + j]*id;
 								        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
 								        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
 								        dsti->qs[j]  = xi0;
 								        dsti->qs[j] |= xi1 << 4;
 								    }
 								}
 								static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
 								    const float * xi = (const float *) cxi;
 								    block_q4_1 * dsti = (block_q4_1 *) cdsti;
 								    float vmin = FLT_MAX;
 								    float vmax = -FLT_MAX;
 								    for (int j = 0; j < QK4_1; ++j) {
 								        const float v = xi[j];
 								        if (v < vmin) vmin = v;
 								        if (v > vmax) vmax = v;
 								    }
 								    const float d  = (vmax - vmin) / ((1 << 4) - 1);
 								    const float id = d ? 1.0f/d : 0.0f;
 								    dsti->dm.x = d;
 								    dsti->dm.y = vmin;
 								    for (int j = 0; j < QK4_1/2; ++j) {
 								        const float x0 = (xi[0       + j] - vmin)*id;
 								        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
 								        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
 								        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
 								        dsti->qs[j]  = xi0;
 								        dsti->qs[j] |= xi1 << 4;
 								    }
 								}
 								template <cpy_kernel_t cpy_blck, int qk>
 								static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
 								                                 const int nb12, const int nb13) {
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
 								    if (i >= ne) {
 								        return;
 								    }
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int i03 = i/(ne00 * ne01 * ne02);
 								    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
 								    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
 								    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
 								    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int i13 = i/(ne10 * ne11 * ne12);
 								    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
 								    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
 								    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
 								    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
 								    cpy_blck(cx + x_offset, cdst + dst_offset);
 								}
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
 								    const float y = (i0 / 2 - low) / max(0.001f, high - low);
 								    return 1.0f - min(1.0f, max(0.0f, y));
 								}
 								struct rope_corr_dims {
 								    float v[4];
 								};
 								// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
 								// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
 								static __device__ void rope_yarn(
 								    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
 								    float * cos_theta, float * sin_theta
 								) {
 								    // Get n-d rotational scaling corrected for extrapolation
 								    float theta_interp = freq_scale * theta_extrap;
 								    float theta = theta_interp;
 								    if (ext_factor != 0.0f) {
 								        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
 								        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        // Get n-d magnitude scaling corrected for interpolation
 								        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
 								    }
 								    *cos_theta = cosf(theta) * mscale;
 								    *sin_theta = sinf(theta) * mscale;
 								}
 								// rope == RoPE == rotary positional embedding
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								template<typename T, bool has_pos>
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								static __global__ void rope(
 								    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
 								    float ext_factor, float attn_factor, rope_corr_dims corr_dims
 								) {
-												ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-22 15:25:19 +02:00
+								    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    if (col >= ncols) {
 								        return;
 								    }
-												ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-22 15:25:19 +02:00
+								    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int i = row*ncols + col;
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const int i2 = row/p_delta_rows;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const int p = has_pos ? pos[i2] : 0;
-												cuda : fix RoPE after #2268 (#3897)


											
										
										
											2023-11-02 06:49:44 +01:00
+								    const float theta_base = p*powf(freq_base, -float(col)/ncols);
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
 								    float cos_theta, sin_theta;
 								    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    const float x0 = x[i + 0];
 								    const float x1 = x[i + 1];
 								    dst[i + 0] = x0*cos_theta - x1*sin_theta;
 								    dst[i + 1] = x0*sin_theta + x1*cos_theta;
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								template<typename T, bool has_pos>
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								static __global__ void rope_neox(
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
 								    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								) {
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 10:55:59 +02:00
+								    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 								    if (col >= ncols) {
 								        return;
 								    }
 								    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								    const int ib = col / n_dims;
 								    const int ic = col % n_dims;
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								    if (ib > 0) {
 								        const int i = row*ncols + ib*n_dims + ic;
 								        dst[i + 0] = x[i + 0];
 								        dst[i + 1] = x[i + 1];
 								        return;
 								    }
 								    const int i  = row*ncols + ib*n_dims + ic/2;
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const int i2 = row/p_delta_rows;
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 10:55:59 +02:00
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								    float cur_rot = inv_ndims * ic - ib;
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const int p = has_pos ? pos[i2] : 0;
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
 								    float cos_theta, sin_theta;
 								    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 10:55:59 +02:00
 								    const float x0 = x[i + 0];
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								    const float x1 = x[i + n_dims/2];
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 10:55:59 +02:00
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
 								    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 10:55:59 +02:00
+								}
-												llm : add Falcon support (#2717)

* llama : refactor GGUF constants into static maps

* llama : check if model architecture is known

* llama : refactor llama_model_load_internal()

* gguf : add KV constant maps

* llm : read arch-specific KVs

* convert : add dummy scores + types

* falcon : load tensor data (CPU only)

* llama : fix loading progress bar

* llama : add arch member to llama_model

* falcon : CPU inference working

* falcon : support non-40B models

* falcon : minor

* llama : minor updates

ggml-ci

* convert-falcon-hf-to-gguf.py : fix special token mapping

* llama.cpp : llama default UNK token = id 0

* llama.cpp : fix bpe tokenizer

* llama.cpp : fix the fix of bpe tokenizer

* ggml : pass eps to ggml_norm

* metal : implement RoPE (mode = 2) + avoid ggml_repeat

* ggml : ggml_repeat always creates new tensor

* falcon : copy-paste self-attention from LLaMA

* metal : print extra compute pipeline info

* falcon : minor changes (still chasing the Metal problem)

* llama.cpp : fix linefeed token

* metal : fix GELU kernel numerical stability by using precise::tanh

* metal : temporary workaround for the concurrency optimization bug

* falcon : add CUDA offloading (#2739)

* llama : better model naming and size reporting

* llama : prep new tokenizer support

* llama : advanced BPE tokenizer based on ggllm.cpp imlpementation

* llama : remove oboslete comment

ggml-ci

* common : remove obsolete BPE API + disable test-tokenizer-1

* llama : revert BPE special-case in llama_byte_to_token()

* cuda : add TODOs for RoPE NeoX implementation

* llama : default special tokens based on vocab type

* perplexity : add log for start of tokenization

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-08-23 22:08:04 +02:00
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								static __global__ void rope_glm_f32(
 								    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
 								    int n_ctx
 								) {
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								    const int col = blockDim.x*blockIdx.x + threadIdx.x;
 								    const int half_n_dims = ncols/4;
 								    if (col >= half_n_dims) {
 								        return;
 								    }
 								    const int row = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i = row*ncols + col;
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const int i2 = row/p_delta_rows;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								     // FIXME: this is likely wrong
 								    const int p = pos != nullptr ? pos[i2] : 0;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								    const float sin_theta = sinf(theta);
 								    const float cos_theta = cosf(theta);
 								    const float x0 = x[i + 0];
 								    const float x1 = x[i + half_n_dims];
 								    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
 								    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								    const float sin_block_theta = sinf(block_theta);
 								    const float cos_block_theta = cosf(block_theta);
 								    const float x2 = x[i + half_n_dims * 2];
 								    const float x3 = x[i + half_n_dims * 3];
 								    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
 								    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 								}
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
 								                                 const int n_heads_log2_floor, const float m0, const float m1) {
 								    const int col = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (col >= ncols) {
 								        return;
 								    }
 								    const int row = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int i = row*ncols + col;
 								    const int k = row/k_rows;
 								    float m_k;
 								    if (k < n_heads_log2_floor) {
 								        m_k = powf(m0, k + 1);
 								    } else {
 								        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
 								    }
 								    dst[i] = col * m_k + x[i];
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								    const int row = blockIdx.x;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    const int col = threadIdx.x;
 								    float sum = 0.0f;
 								    for (int i = col; i < ncols; i += blockDim.x) {
 								        sum += x[row * ncols + i];
 								    }
 								    sum = warp_reduce_sum(sum);
 								    if (col == 0) {
 								        dst[row] = sum;
 								    }
 								}
 								template<typename T>
 								static inline __device__ void swap(T & a, T & b) {
 								    T tmp = a;
 								    a = b;
 								    b = tmp;
 								}
 								template<ggml_sort_order order>
 								static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
 								    // bitonic sort
 								    int col = threadIdx.x;
 								    int row = blockIdx.y;
 								    if (col >= ncols) return;
 								    const float * x_row = x + row * ncols;
 								    int * dst_row = dst + row * ncols;
 								    // initialize indices
 								    if (col < ncols) {
 								        dst_row[col] = col;
 								    }
 								    __syncthreads();
 								    for (int k = 2; k <= ncols; k *= 2) {
 								        for (int j = k / 2; j > 0; j /= 2) {
 								            int ixj = col ^ j;
 								            if (ixj > col) {
 								                if ((col & k) == 0) {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								                        swap(dst_row[col], dst_row[ixj]);
 								                    }
 								                } else {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								                        swap(dst_row[col], dst_row[ixj]);
 								                    }
 								                }
 								            }
 								            __syncthreads();
 								        }
 								    }
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
-												ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-22 15:25:19 +02:00
+								    const int col = blockDim.y*blockIdx.y + threadIdx.y;
 								    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    if (col >= ncols) {
 								        return;
 								    }
 								    const int i = row*ncols + col;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
 								    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 								    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								template <bool vals_smem, int ncols_template, int block_size_template>
 								static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
 								    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
 								    const int tid  = threadIdx.x;
 								    const int rowx = blockIdx.x;
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
 								    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
 								    const int warp_id = threadIdx.x / WARP_SIZE;
 								    const int lane_id = threadIdx.x % WARP_SIZE;
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    float slope = 0.0f;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    // ALiBi
 								    if (max_bias > 0.0f) {
 								        const int h = rowx/nrows_y; // head index
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								        const float base = h < n_head_log2 ? m0 : m1;
 								        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								        slope = powf(base, exp);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    }
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    extern __shared__ float data_soft_max_f32[];
 								    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
 								    // shared memory buffer to cache values between iterations:
 								    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												Fix CUDA softmax by subtracting max value before exp (#2665)


											
										
										
											2023-08-22 20:27:06 +02:00
+								    float max_val = -INFINITY;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								#pragma unroll
 								    for (int col0 = 0; col0 < ncols; col0 += block_size) {
 								        const int col = col0 + tid;
 								        if (ncols_template == 0 && col >= ncols) {
 								            break;
 								        }
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        const int ix = rowx*ncols + col;
 								        const int iy = rowy*ncols + col;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
-												cuda, metal : fix nans in soft_max (#5574)

* cuda : fix nans in soft_max

* metal : fix nans in soft_max

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-19 09:04:45 +01:00
+								        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								        vals[col] = val;
 								        max_val = max(max_val, val);
-												Fix CUDA softmax by subtracting max value before exp (#2665)


											
										
										
											2023-08-22 20:27:06 +02:00
+								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												Fix CUDA softmax by subtracting max value before exp (#2665)


											
										
										
											2023-08-22 20:27:06 +02:00
+								    // find the max value in the block
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								    max_val = warp_reduce_max(max_val);
 								    if (block_size > WARP_SIZE) {
 								        if (warp_id == 0) {
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								            buf_iw[lane_id] = -INFINITY;
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        }
 								        __syncthreads();
 								        if (lane_id == 0) {
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								            buf_iw[warp_id] = max_val;
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        }
 								        __syncthreads();
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								        max_val = buf_iw[lane_id];
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        max_val = warp_reduce_max(max_val);
-												Fix CUDA softmax by subtracting max value before exp (#2665)


											
										
										
											2023-08-22 20:27:06 +02:00
+								    }
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    float tmp = 0.0f; // partial sum
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								#pragma unroll
 								    for (int col0 = 0; col0 < ncols; col0 += block_size) {
 								        const int col = col0 + tid;
 								        if (ncols_template == 0 && col >= ncols) {
 								            break;
 								        }
 								        const float val = expf(vals[col] - max_val);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        tmp += val;
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								        vals[col] = val;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    }
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								    // find the sum of exps in the block
 								    tmp = warp_reduce_sum(tmp);
 								    if (block_size > WARP_SIZE) {
-												cuda : fix data race in soft max (#5853)


											
										
										
											2024-03-03 14:26:18 +01:00
+								        __syncthreads();
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        if (warp_id == 0) {
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								            buf_iw[lane_id] = 0.0f;
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        }
 								        __syncthreads();
 								        if (lane_id == 0) {
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								            buf_iw[warp_id] = tmp;
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        }
 								        __syncthreads();
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								        tmp = buf_iw[lane_id];
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								        tmp = warp_reduce_sum(tmp);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    }
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    const float inv_sum = 1.0f / tmp;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								#pragma unroll
 								    for (int col0 = 0; col0 < ncols; col0 += block_size) {
 								        const int col = col0 + tid;
 								        if (ncols_template == 0 && col >= ncols) {
 								            return;
 								        }
 								        const int idst = rowx*ncols + col;
 								        dst[idst] = vals[col] * inv_sum;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    }
 								}
 								static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = scale * x[i];
 								}
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
 								    const int i = blockDim.x*blockIdx.x + threadIdx.x;
 								    if (i >= k) {
 								        return;
 								    }
 								    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								template <typename T>
 								static  __global__ void im2col_kernel(
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        const float * x, T * dst, int64_t batch_offset,
 								        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								        int s0, int s1, int p0, int p1, int d0, int d1) {
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    if (i >= pelements) {
 								        return;
 								    }
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
 								    const int64_t  kx = i / ksize;
 								    const int64_t  kd = kx * ksize;
 								    const int64_t  ky = (i - kd) / OW;
 								    const int64_t  ix = i % OW;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    const int64_t  oh = blockIdx.y;
 								    const int64_t  batch = blockIdx.z / IC;
 								    const int64_t  ic = blockIdx.z % IC;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
-												sync : ggml (fix im2col) (#4591)

* cuda : fix im2col_f32_f16 (ggml/#658)

ggml-ci

* ggml-alloc : fix ggml_tallocr_is_own

---------

Co-authored-by: leejet <leejet714@gmail.com>
											
										
										
											2023-12-22 16:53:43 +01:00
+								    const int64_t iiw = ix * s0 + kx * d0 - p0;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								    const int64_t iih = oh * s1 + ky * d1 - p1;
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
-												sync : ggml (fix im2col) (#4591)

* cuda : fix im2col_f32_f16 (ggml/#658)

ggml-ci

* ggml-alloc : fix ggml_tallocr_is_own

---------

Co-authored-by: leejet <leejet714@gmail.com>
											
										
										
											2023-12-22 16:53:43 +01:00
+								    const int64_t offset_dst =
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								        ((batch * OH + oh) * OW + ix) * CHW +
 								        (ic * (KW * KH) + ky * KW + kx);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
 								    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								        dst[offset_dst] = 0.0f;
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								    } else {
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
 								        dst[offset_dst] = x[offset_src + iih * IW + iiw];
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								    }
 								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								template <typename Ti, typename To>
 								static  __global__ void pool2d_nchw_kernel(
 								        const int ih, const int iw, const int oh, const int ow,
 								        const int kh, const int kw, const int sh, const int sw,
 								        const int ph, const int pw, const int parallel_elements,
 								        const Ti* src, To* dst, const enum ggml_op_pool op) {
 								        int idx = threadIdx.x + blockIdx.x * blockDim.x;
 								        if (idx >= parallel_elements) {
 								            return;
 								        }
 								        const int I_HW = ih * iw;
 								        const int O_HW = oh * ow;
 								        const int nc = idx / O_HW;
 								        const int cur_oh = idx % O_HW / ow;
 								        const int cur_ow = idx % O_HW % ow;
 								        const Ti* i_ptr = src + nc * I_HW;
 								        To* o_ptr = dst + nc * O_HW;
 								        const int start_h = cur_oh * sh - ph;
 								        const int bh = max(0, start_h);
 								        const int eh = min(ih, start_h + kh);
 								        const int start_w = cur_ow * sw - pw;
 								        const int bw = max(0, start_w);
 								        const int ew = min(iw, start_w + kw);
 								        const To scale = 1. / (kh * kw);
 								        To res = 0;
 								        switch (op) {
 								            case GGML_OP_POOL_AVG: res = 0; break;
 								            case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
 								        }
 								        for (int i = bh; i < eh; i += 1) {
 								            for (int j = bw; j < ew; j += 1) {
 								    #if __CUDA_ARCH__ >= 350
 								                Ti cur = __ldg(i_ptr + i * iw + j);
 								    #else
 								                Ti cur = i_ptr[i * iw + j];
 								    #endif
 								                switch (op) {
 								                    case GGML_OP_POOL_AVG: res += cur * scale; break;
 								                    case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
 								                }
 								            }
 								        }
 								        o_ptr[cur_oh * ow + cur_ow] = res;
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								template<int qk, int qr, dequantize_kernel_t dq>
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
 								    GGML_TENSOR_BINARY_OP_LOCALS
 								    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
 								    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
 								    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
 								    // strides in elements
 								    //const size_t s0 = nb0 / ggml_element_size(dst);
 								    const size_t s1 = nb1 / ggml_element_size(dst);
 								    const size_t s2 = nb2 / ggml_element_size(dst);
 								    const size_t s3 = nb3 / ggml_element_size(dst);
 								    const size_t s10 = nb10 / ggml_element_size(src1);
 								    const size_t s11 = nb11 / ggml_element_size(src1);
 								    const size_t s12 = nb12 / ggml_element_size(src1);
 								    //const size_t s13 = nb13 / ggml_element_size(src1);
 								    GGML_ASSERT(ne00 % 2 == 0);
 								    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
 								            src0_dd, src1_dd, dst_dd,
 								            ne00, /*ne01, ne02, ne03,*/
 								            /*ne10, ne11,*/ ne12, /*ne13,*/
 								            /* s0,*/ s1, s2, s3,
 								            /* nb00,*/ nb01, nb02, nb03,
 								            s10, s11, s12/*, s13*/);
 								    (void) dst;
 								}
 								template<typename src0_t>
 								static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
 								    GGML_TENSOR_BINARY_OP_LOCALS
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
 								    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
 								    // strides in elements
 								    //const size_t s0 = nb0 / ggml_element_size(dst);
 								    const size_t s1 = nb1 / ggml_element_size(dst);
 								    const size_t s2 = nb2 / ggml_element_size(dst);
 								    const size_t s3 = nb3 / ggml_element_size(dst);
 								    const size_t s10 = nb10 / ggml_element_size(src1);
 								    const size_t s11 = nb11 / ggml_element_size(src1);
 								    const size_t s12 = nb12 / ggml_element_size(src1);
 								    //const size_t s13 = nb13 / ggml_element_size(src1);
 								    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
 								            src0_dd, src1_dd, dst_dd,
 								            ne00, /*ne01, ne02, ne03,*/
 								            /*ne10, ne11,*/ ne12, /*ne13,*/
 								            /* s0,*/ s1, s2, s3,
 								            /* nb00,*/ nb01, nb02, nb03,
 								            s10, s11, s12/*, s13*/);
 								    (void) dst;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								template<float (*bin_op)(const float, const float)>
 								struct bin_bcast_cuda {
 								    template<typename src0_t, typename src1_t, typename dst_t>
 								    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
 								            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
 								            cudaStream_t stream) {
 								        GGML_TENSOR_BINARY_OP_LOCALS
 								        int nr0 = ne10/ne0;
 								        int nr1 = ne11/ne1;
 								        int nr2 = ne12/ne2;
 								        int nr3 = ne13/ne3;
 								        int nr[4] = { nr0, nr1, nr2, nr3 };
 								        // collapse dimensions until first broadcast dimension
 								        int64_t cne0[] = {ne0, ne1, ne2, ne3};
 								        int64_t cne1[] = {ne10, ne11, ne12, ne13};
 								        size_t cnb0[] = {nb0, nb1, nb2, nb3};
 								        size_t cnb1[] = {nb10, nb11, nb12, nb13};
 								        auto collapse = [](int64_t cne[]) {
 								            cne[0] *= cne[1];
 								            cne[1] = cne[2];
 								            cne[2] = cne[3];
 								            cne[3] = 1;
 								        };
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            cnb[1] *= cne[1];
 								            cnb[2] *= cne[2];
 								            cnb[3] *= cne[3];
 								        };
 								        for (int i = 0; i < 4; i++) {
 								            if (nr[i] != 1) {
 								                break;
 								            }
 								            if (i > 0) {
 								                collapse_nb(cnb0, cne0);
 								                collapse_nb(cnb1, cne1);
 								                collapse(cne0);
 								                collapse(cne1);
 								            }
 								        }
 								        {
 								            int64_t ne0 = cne0[0];
 								            int64_t ne1 = cne0[1];
 								            int64_t ne2 = cne0[2];
 								            int64_t ne3 = cne0[3];
 								            int64_t ne10 = cne1[0];
 								            int64_t ne11 = cne1[1];
 								            int64_t ne12 = cne1[2];
 								            int64_t ne13 = cne1[3];
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            size_t nb0 = cnb0[0];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            size_t nb1 = cnb0[1];
 								            size_t nb2 = cnb0[2];
 								            size_t nb3 = cnb0[3];
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            size_t nb10 = cnb1[0];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            size_t nb11 = cnb1[1];
 								            size_t nb12 = cnb1[2];
 								            size_t nb13 = cnb1[3];
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								            size_t s0 = nb0 / sizeof(dst_t);
 								            size_t s1 = nb1 / sizeof(dst_t);
 								            size_t s2 = nb2 / sizeof(dst_t);
 								            size_t s3 = nb3 / sizeof(dst_t);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            size_t s10 = nb10 / sizeof(src1_t);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            size_t s11 = nb11 / sizeof(src1_t);
 								            size_t s12 = nb12 / sizeof(src1_t);
 								            size_t s13 = nb13 / sizeof(src1_t);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            GGML_ASSERT(s0 == 1);
 								            GGML_ASSERT(s10 == 1);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								            const int block_size = 128;
 								            int64_t hne0 = std::max(ne0/2LL, 1LL);
 								            dim3 block_dims;
 								            block_dims.x = std::min<unsigned int>(hne0, block_size);
 								            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
 								            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
 								            dim3 block_nums(
 								                (hne0 + block_dims.x - 1) / block_dims.x,
 								                (ne1 + block_dims.y - 1) / block_dims.y,
 								                (ne2*ne3 + block_dims.z - 1) / block_dims.z
 								            );
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 12:49:04 +01:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            if (block_nums.z > 65535) {
 								                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
 								                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
 								                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
 								                    src0_dd, src1_dd, dst_dd,
 								                    ne0, ne1, ne2, ne3,
 								                    ne10, ne11, ne12, ne13,
 								                    /* s0, */ s1, s2, s3,
 								                    /* s10, */ s11, s12, s13);
 								            } else {
 								                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
 								                    src0_dd, src1_dd, dst_dd,
 								                    ne0, ne1, ne2, ne3,
 								                    ne10, ne11, ne12, ne13,
 								                    /* s0, */ s1, s2, s3,
 								                    /* s10, */ s11, s12, s13);
 								            }
 								        }
 								    }
 								};
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
 								    const int ne10, const int ne11, const int ne12,
 								    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
 								    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
 								    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
 								}
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
+								static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
 								    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
 								    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
 								    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
 								static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
 								    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
 								    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
 								    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
 								static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
 								    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
 								    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
 								}
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
 								    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								    GGML_ASSERT(ncols % WARP_SIZE == 0);
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    if (ncols < 1024) {
 								        const dim3 block_dims(WARP_SIZE, 1, 1);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    } else {
 								        const dim3 block_dims(1024, 1, 1);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    }
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
 								    static const float eps = 1e-6f;
 								    if (group_size < 1024) {
 								        const dim3 block_dims(WARP_SIZE, 1, 1);
 								        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
 								    } else {
 								        const dim3 block_dims(1024, 1, 1);
 								        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
 								    }
 								}
 								static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
 								    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
 								    dim3 gridDim(num_blocks, ne1, ne2);
 								    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 								}
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
 								                             const int scale_factor, cudaStream_t stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int ne0 = (ne00 * scale_factor);
 								    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
 								}
 								static void pad_f32_cuda(const float * x, float * dst,
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    const int ne00, const int ne01, const int ne02, const int ne03,
 								    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    dim3 gridDim(num_blocks, ne1, ne2*ne3);
 								    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
 								}
 								static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
 								    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
 								    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
 								}
 								static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
 								                                        const int dim, const int max_period, cudaStream_t stream) {
 								    int half_ceil = (dim + 1) / 2;
 								    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
 								    dim3 gridDim(num_blocks, ne00, 1);
 								    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								}
-												make rms_norm_eps a parameter (#2374)

* make rms_norm_eps a parameter

* add rms_norm_eps to command line

* fix baby llama, test-grad0

* use scientific notation for eps param in the help

ggml-ci
											
										
										
											2023-07-24 17:57:12 +02:00
+								static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    GGML_ASSERT(ncols % WARP_SIZE == 0);
-x faster (rms) norm cuda kernels (3.7% e2e improvement) (#2985)

* 2x faster (rms) norm cuda kernels

* Fix code style
											
										
										
											2023-09-04 08:53:30 +02:00
+								    if (ncols < 1024) {
 								        const dim3 block_dims(WARP_SIZE, 1, 1);
 								        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 								    } else {
 								        const dim3 block_dims(1024, 1, 1);
 								        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
 								    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
 								    const dim3 num_blocks(block_num_x, ky, 1);
 								    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
 								    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 								static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 17:24:08 +01:00
+								    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-												ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
											
										
										
											2023-04-25 22:40:51 +02:00
+								}
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 20:38:54 +01:00
+								static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
 								    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
 								        const bool need_check = false;
 								        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
 								    } else {
 								        const bool need_check = true;
 								        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
 								    }
 								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const int nb = k / QK_K;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const int nb = k / QK_K;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 06:48:06 +01:00
+								template<typename dst_t>
-												cuda : fix dequantize kernel names (#4938)

											
										
										
											2024-01-15 12:27:00 +01:00
+								static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 06:48:06 +01:00
+								    const int nb32 = k / 32;
 								    const int nb = (k + 255) / 256;
 								    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
 								}
 								template<typename dst_t>
-												cuda : fix dequantize kernel names (#4938)

											
										
										
											2024-01-15 12:27:00 +01:00
+								static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-												CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-15 06:48:06 +01:00
+								    const int nb32 = k / 32;
 								    const int nb = (k + 255) / 256;
 								    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const int nb = k / QK_K;
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const int nb = k / QK_K;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								template<typename dst_t>
 								static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const int nb = k / QK_K;
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#if QK_K == 256
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
-												k-quants : support for super-block size of 64 (#2001)

* k_quants: WIP super-blocks with 64 weights

* k_quants: WIP super-blocks with 64 weights

Q6_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q4_K scalar and AVX2 works

* k_quants: WIP super-blocks with 64 weights

Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower
than the scalar implementation)

* k_quants: WIP super-blocks with 64 weights

Q3_K scalar and AVX2 works.

* k_quants: WIP super-blocks with 64 weights

Q5_K scalar and AVX2 works, and with that all
k_quants are done on AVX2 and scalar

* k_quants: WIP super-blocks with 64 weights

Q6_K working on CUDA. Cannot make it run quite as gast as
with super-blocks with 256 weigths: 8% slower on 4080,
20% slower on the 1660 (but there we fit 1 less layer on the
GPU because pf the larger model size), so some fraction of
these 20% is due to that,

* k_quants: WIP super-blocks with 64 weights

Q4_K working on CUDA. ~10% slower on GTX-1660,
16% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q2_K working on CUDA. ~3% slower on GTX-1660,
10% slower on 4080.

* k_quants: WIP super-blocks with 64 weights

Q3_K working on CUDA.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on CUDA, and with this CUDA is done.

* k_quants: WIP super-blocks with 64 weights

Q6_K working on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Q4_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q2_K working on ARM_NEON, but quite a bit slower than 256 weights

* k_quants: WIP super-blocks with 64 weights

Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.

* k_quants: WIP super-blocks with 64 weights

Q5_K working on ARM_NEON, but quite a bit slower than 256 weights.

With that, we have full support for ARM_NEON, although
performance is not quite there.

* k_quants: WIP super-blocks with 64 weights

Slightly more efficient Q3_K and Q5_K

* k_quants: WIP super-blocks with 64 weights

Another small improvement for Q3_K and Q5_K on ARM_NEON

* k_quants: WIP super-blocks with 64 weights

Yet another speedup for Q5_K on ARM_NEON.
We are now within 10% of the QK_K = 256 version.

* k_quants: WIP super-blocks with 64 weights

* We are able to pass preprocessor macros to the Metal
  compiler
* Q6_K works and is actually slightly more efficient than
  the QK_K = 256 version (25.2 ms vs 25.8 ms)

* k_quants: WIP super-blocks with 64 weights

Q4_K works on Metal and is actually slightly faster
than QK_K = 256 (21.95 ms vs 24.0 ms).

* k_quants: WIP super-blocks with 64 weights

Q2_K works on Metal and is very slightly faster
than QK_K = 256 (23.8 ms vs 24.2 ms).

* k_quants: WIP super-blocks with 64 weights

Q3_K works on Metal and is slightly faster
than QK_K = 256 (26.6 ms vs 28.3 ms).

* k_quants: WIP super-blocks with 64 weights

Q5_K works on Metal and is slightly faster
than QK_K = 256 (23.7 ms vs 26.3 ms).

* k_quants: call them _K, not _k, also on Metal

* k_quants: correctly define QK_K in llama.cpp

* Fixed bug in q4_K quantization added with the 64-block addition

* Simplify via lambda

* k_quants: swicth Q3_K to 4-bit scales when QK_K = 64

Otherwise there isn't much benefit from this
quantization type. There is some very slight loss
in accuracy, but we reduce size by ~7%.
E.g., for OpenLLaMA-3B, Q3_K_S perplexity is
8.6131 with 8-bit scales and 8.6352 with 4-bit,
while file size decreases from 1.53G to 1.44G.

* k_quants: switch Q4_K to 4-bit scales when QK_K = 64

 Here the loss in accuracy is greater than for Q3_K,
 but the Q4_K points still move further to the left on
 the perplexity vs size curve.

* k_quants: forgot to add the Metal changes in last commit

* k_quants: change Q5_K to be type 0 when QK_K = 64

Still needs AVX2 implementation

* k_quants: AVX2 implementation for new 64-weight Q5_K

* k_quants: 10% faster ARM_NEON Q5_K dot product

* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-26 18:43:07 +02:00
+								#else
 								    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
 								#endif
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
 								}
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = k / QK_K;
 								    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = (k + QK_K - 1) / QK_K;
 								    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 								}
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								template<typename dst_t>
 								static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 								    const int nb = (k + QK_K - 1) / QK_K;
-												ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760)

* WIP: make i-quants work for QK_K = 64

* iq2_xs: attempt to fix AVX dot product for QK_K = 64

Tests pass, but I get gibberish.

* QK_K = 64 tests pass on ARM_NEON and Metal

Sadly, that does not mean it actually works.

* Make CUDA compile with QK_K = 64

Tests don't pass, plus we get misaligned access

* Q2_K: fixed bug in imatrix quantization for QK_K = 64

* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-28 09:37:02 +01:00
+								#if QK_K == 64
 								    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 								#else
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
-												ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760)

* WIP: make i-quants work for QK_K = 64

* iq2_xs: attempt to fix AVX dot product for QK_K = 64

Tests pass, but I get gibberish.

* QK_K = 64 tests pass on ARM_NEON and Metal

Sadly, that does not mean it actually works.

* Make CUDA compile with QK_K = 64

Tests don't pass, plus we get misaligned access

* Q2_K: fixed bug in imatrix quantization for QK_K = 64

* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-28 09:37:02 +01:00
+								#endif
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								}
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 17:24:08 +01:00
+								template <typename src_t, typename dst_t>
 								static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 								    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 20:38:54 +01:00
+								    int id;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    switch (type) {
 								        case GGML_TYPE_Q4_0:
-												cuda : fix dequantize kernel names (#4938)

											
										
										
											2024-01-15 12:27:00 +01:00
+								            return dequantize_row_q4_0_cuda;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_TYPE_Q4_1:
-												cuda : fix dequantize kernel names (#4938)

											
										
										
											2024-01-15 12:27:00 +01:00
+								            return dequantize_row_q4_1_cuda;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_TYPE_Q5_0:
 								            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
 								        case GGML_TYPE_Q5_1:
 								            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
 								        case GGML_TYPE_Q8_0:
-												CUDA: faster q8_0 -> f16 dequantization (#4895)


											
										
										
											2024-01-12 20:38:54 +01:00
+								            CUDA_CHECK(cudaGetDevice(&id));
 								            if (g_device_caps[id].cc >= CC_PASCAL) {
 								                return dequantize_block_q8_0_f16_cuda;
 								            }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
 								        case GGML_TYPE_Q2_K:
 								            return dequantize_row_q2_K_cuda;
 								        case GGML_TYPE_Q3_K:
 								            return dequantize_row_q3_K_cuda;
 								        case GGML_TYPE_Q4_K:
 								            return dequantize_row_q4_K_cuda;
 								        case GGML_TYPE_Q5_K:
 								            return dequantize_row_q5_K_cuda;
 								        case GGML_TYPE_Q6_K:
 								            return dequantize_row_q6_K_cuda;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								        case GGML_TYPE_IQ2_XXS:
 								            return dequantize_row_iq2_xxs_cuda;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        case GGML_TYPE_IQ2_XS:
 								            return dequantize_row_iq2_xs_cuda;
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								        case GGML_TYPE_IQ2_S:
 								            return dequantize_row_iq2_s_cuda;
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								        case GGML_TYPE_IQ3_XXS:
 								            return dequantize_row_iq3_xxs_cuda;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								        case GGML_TYPE_IQ1_S:
 								            return dequantize_row_iq1_s_cuda;
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								        case GGML_TYPE_IQ4_NL:
 								            return dequantize_row_iq4_nl_cuda;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								        case GGML_TYPE_IQ4_XS:
 								            return dequantize_row_iq4_xs_cuda;
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								        case GGML_TYPE_IQ3_S:
 								            return dequantize_row_iq3_s_cuda;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_TYPE_F32:
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 17:24:08 +01:00
+								            return convert_unary_cuda<float>;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        default:
 								            return nullptr;
 								    }
 								}
 								static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
 								    switch (type) {
 								        case GGML_TYPE_Q4_0:
-												cuda : fix dequantize kernel names (#4938)

											
										
										
											2024-01-15 12:27:00 +01:00
+								            return dequantize_row_q4_0_cuda;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_TYPE_Q4_1:
-												cuda : fix dequantize kernel names (#4938)

											
										
										
											2024-01-15 12:27:00 +01:00
+								            return dequantize_row_q4_1_cuda;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_TYPE_Q5_0:
 								            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
 								        case GGML_TYPE_Q5_1:
 								            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
 								        case GGML_TYPE_Q8_0:
 								            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
 								        case GGML_TYPE_Q2_K:
 								            return dequantize_row_q2_K_cuda;
 								        case GGML_TYPE_Q3_K:
 								            return dequantize_row_q3_K_cuda;
 								        case GGML_TYPE_Q4_K:
 								            return dequantize_row_q4_K_cuda;
 								        case GGML_TYPE_Q5_K:
 								            return dequantize_row_q5_K_cuda;
 								        case GGML_TYPE_Q6_K:
 								            return dequantize_row_q6_K_cuda;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								        case GGML_TYPE_IQ2_XXS:
 								            return dequantize_row_iq2_xxs_cuda;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        case GGML_TYPE_IQ2_XS:
 								            return dequantize_row_iq2_xs_cuda;
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								        case GGML_TYPE_IQ2_S:
 								            return dequantize_row_iq2_s_cuda;
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								        case GGML_TYPE_IQ3_XXS:
 								            return dequantize_row_iq3_xxs_cuda;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								        case GGML_TYPE_IQ1_S:
 								            return dequantize_row_iq1_s_cuda;
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								        case GGML_TYPE_IQ4_NL:
 								            return dequantize_row_iq4_nl_cuda;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								        case GGML_TYPE_IQ4_XS:
 								            return dequantize_row_iq4_xs_cuda;
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								        case GGML_TYPE_IQ3_S:
 								            return dequantize_row_iq3_s_cuda;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_TYPE_F16:
-												CUDA: fixed redundant value dequantization (#4809)


											
										
										
											2024-01-07 17:24:08 +01:00
+								            return convert_unary_cuda<half>;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        default:
 								            return nullptr;
 								    }
 								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
 								    const dim3 block_nums(block_num_y, 1, 1);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
+								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-												cuda : performance optimizations (#1530)

* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
											
										
										
											2023-05-25 23:07:29 +02:00
+								    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    GGML_ASSERT(ncols % QK_K == 0);
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const int block_num_y = (nrows + ny - 1) / ny;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    const dim3 block_dims(32, ny, 1);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    GGML_ASSERT(ncols % QK_K == 0);
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int ny = 2 / K_QUANTS_PER_ITERATION;
 								    const int block_num_y = (nrows + ny - 1) / ny;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const dim3 block_dims(32, ny, 1);
 								    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    GGML_ASSERT(ncols % QK_K == 0);
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const int ny = 2 / K_QUANTS_PER_ITERATION;
 								    const int block_num_y = (nrows + ny - 1) / ny;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												cuda : faster k-quants on older GPUs (#1930)

* k_quants: hopefully much faster Q4_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 65.5 ms/tok
to 41.5 ms/tok!

* k_quants: hopefully much faster Q3_K on older GPUs

On the GTX-1660 that I have available to represent
"old GPUs", token prediction drops from 60.3 ms/tok
to 41.0 ms/tok!

* k_quants: faster Q2_K on older GPUs

It looks like I didn't need to change anything
compared to what we already had, so this is just
adding clarifying comments. But I now measure
36.3 ms/tok on the GTX-1660, instead fo the
47.2 ms/tok that I have written in the faster
k-quants PR.

* k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-19 17:14:09 +02:00
+								    const dim3 block_dims(32, ny, 1);
 								    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    GGML_ASSERT(ncols % QK_K == 0);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const dim3 block_dims(32, 1, 1);
 								    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												k-quants : allow to optionally disable at compile time (#1734)

* k-quants : put behind optional compile flag LLAMA_K_QUANTS

* build : enable k-quants by default
											
										
										
											2023-06-07 09:59:52 +02:00
+								static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								    GGML_ASSERT(ncols % QK_K == 0);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    const int ny = 2 / K_QUANTS_PER_ITERATION;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const int block_num_y = (nrows + ny - 1) / ny;
-												ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921)


											
										
										
											2023-11-03 12:13:09 +01:00
+								    const dim3 block_nums(block_num_y, 1, 1);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const dim3 block_dims(32, ny, 1);
-												CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-06-16 19:08:44 +02:00
+								    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-												ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)

* Starting to add k-quantization to ggml

I think it is better to have quantization separate from
ggml. For now just adding the k-quants there, but it would be
better to also factor out the existing ggml quantizations.

* Adding Q3_K and Q8_K (de)-quantization

* Q3_K now working on CUDA and AVX2/scalar

CUDA is not ideal - ~50% slower than Q4_0 for
single token prediction, about the same in batch
mode (perplexity). CPU single token is ~55 ms
(on Ryzen 7950X).

* Some improvement for Q3_K on CUDA

It is now ~22.5 ms/token on my GPU, so ~30% slower than Q4_0.

* Some more CUDA optimizations for Q3_K

Single token is now 20.5 ms/token (~20% slower than Q4_0).
Perplexity is on par with Q4_0.

* Adding Q4_K - scalar, AVX2, CUDA

Performance is the same or perhaps very slightly better than Q4_0 on the CPU.
On the GPU, single token prediction is ~10% better than Q4_0,
batch mode (perplexity is about the same).

* Adding Q6_K - scalar, AVX2, CUDA

Performance is ~40% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 6-bit model is ~44% larger than the 4-bit.
On the GPU, single token prediction is ~6% lower than Q4_0,
batch mode (perplexity) is even closer (but still slower).

* Adding Q5_K - scalar, AVX2, CUDA

Performance is ~20% lower compared to Q4_K on the CPU.
This is to be expected, considering that we are memory bound
on the CPU and the 5-bit model is ~22% larger than the 4-bit.
On the GPU, single token prediction is about the same as Q4_0
for both, single token and batch prediction.

* Per convention, all QX_K quantizations use Q5_K for output.weight

* Adding quantization mixes

* Quantization mixes: didn't quite get what I wanted in the last commit

* Q4_K dot product for ARM_NEON

* Q6_K dot product for ARM_NEON

* Q5_K dot product for ARM_NEON

* Adding Q3_K dot for ARM_NEON

It is 22% slower than Q4_K, despite the smaller model size.
On x86_64, where we are memory bound, the Q3_K model is
quite a bit faster than Q4_K.

* A very slightly faster ARM_NEON Q3_K dot

* Adding Q2_K - just CUDA for now

Token prediction is pretty good - about 15.5 ms on a RTX 4080.
Perplexity is about the same as Q4_K.

* Adding scalar and AVX2 Q2_K dot

* Adding ARM_NEON Q2_K dot

About the same performance as Q4_K.

* A slightly faster ARM_NEON Q2_K dot

Single token prediction is now ~36 ms on M2 Max.
The code is much simpler too.

* Fixed bug in Q2_K CUDA dot product kernel

Stranegly enough, for the few prompts I tried with the 7B model
the responses looked perfectly reasonable. Only realized something
is not quite right when I tried the larger models and started getting
nonse back.

In any case, Q2_K single token evaluation time on an RTX 4080 in a Ryzen7950X
box iusing CUDA and model fully loaded on the GPU are
  ~15.5 ms for 7B, ~25.4 ms for 13B, and ~55.8 ms for 30B.
The max number of layers that fit in VRAM for The 65B is 32.
With that, we get ~330 ms per token, which is not that much faster
than just running on the CPU (~470 ms per token).

* Don't print zeros/NaNs when no count histogram has been collected

* A 10% faster CUDA vector dot kernel for Q3_K

Q3_K is now running at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems memory acccess pattern is more important for
performance than the amount of computation the kernel
does.

* A slightly daster Q4_K AVX2 dot product

For perplexity, where we are less memory bound, time per
pass drops by ~5%. Barely measurable difference for single
token prediction.

* A slightly faster ARM_NEON A4_K dot product

* Minor

* Fix quantization error test

We cannot possibly be expecting rmse < 0.002 for 2- and 3-bit
quantization variants.

* Fix docker build

I have been sloppy with vector reinterpret casts on ARM_NEON.
It seems clang is very forgiving in that regard.

* Added forgotten ggml.o dependence on k_quants.h to the Makefile

* Had unintentionally committed the Makefile with -Ofast enabled

* ggml : rename k_quants -> ggml-quants-k, use lowercase in code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-06-05 21:56:18 +02:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 								    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 								    const dim3 block_nums(block_num_y, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 								    dequantize_mul_mat_vec<1, 1, convert_f16>
 								        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 								}
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
 								static void mul_mat_vec_q_cuda(
 								    const void * vx, const void * vy, float * dst,
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								    GGML_ASSERT(ncols_x % qk == 0);
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    int64_t nwarps = 1;
 								    int64_t rows_per_cuda_block = 1;
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
 								        switch(ncols_y) {
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								            case 1:
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								                nwarps = 4;
 								                rows_per_cuda_block = 1;
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								                break;
 								            case 2:
 								            case 3:
 								            case 4:
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								                nwarps = 4;
 								                rows_per_cuda_block = 2;
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								                break;
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								            case 5:
 								            case 6:
 								            case 7:
 								            case 8:
 								                nwarps = 2;
 								                rows_per_cuda_block = 2;
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
+								                break;
 								            default:
 								                GGML_ASSERT(false);
 								                break;
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								        }
 								    }
 								    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
 								    const dim3 block_nums(nblocks, 1, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												CUDA: more warps for mmvq on NVIDIA (#5394)


											
										
										
											2024-02-08 21:56:40 +01:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    switch (ncols_y) {
 								        case 1:
 								            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 2:
 								            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 3:
 								            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 4:
 								            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 5:
 								            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 6:
 								            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 7:
 								            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
 								        case 8:
 								            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
 								                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 								            break;
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								}
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								static void ggml_mul_mat_q4_0_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q4_0_RDNA2;
 								        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
 								        nwarps = NWARPS_Q4_0_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q4_0_RDNA1;
 								        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
 								        nwarps = NWARPS_Q4_0_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q4_0_AMPERE;
 								        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
 								        nwarps = NWARPS_Q4_0_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q4_0_PASCAL;
 								        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
 								        nwarps = NWARPS_Q4_0_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q4_1_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q4_1_RDNA2;
 								        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
 								        nwarps = NWARPS_Q4_1_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q4_1_RDNA1;
 								        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
 								        nwarps = NWARPS_Q4_1_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q4_1_AMPERE;
 								        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
 								        nwarps = NWARPS_Q4_1_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q4_1_PASCAL;
 								        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
 								        nwarps = NWARPS_Q4_1_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q5_0_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q5_0_RDNA2;
 								        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
 								        nwarps = NWARPS_Q5_0_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q5_0_RDNA1;
 								        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
 								        nwarps = NWARPS_Q5_0_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q5_0_AMPERE;
 								        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
 								        nwarps = NWARPS_Q5_0_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q5_0_PASCAL;
 								        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
 								        nwarps = NWARPS_Q5_0_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q5_1_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q5_1_RDNA2;
 								        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
 								        nwarps = NWARPS_Q5_1_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q5_1_RDNA1;
 								        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
 								        nwarps = NWARPS_Q5_1_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q5_1_AMPERE;
 								        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
 								        nwarps = NWARPS_Q5_1_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q5_1_PASCAL;
 								        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
 								        nwarps = NWARPS_Q5_1_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q8_0_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q8_0_RDNA2;
 								        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
 								        nwarps = NWARPS_Q8_0_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q8_0_RDNA1;
 								        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
 								        nwarps = NWARPS_Q8_0_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q8_0_AMPERE;
 								        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
 								        nwarps = NWARPS_Q8_0_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q8_0_PASCAL;
 								        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
 								        nwarps = NWARPS_Q8_0_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q2_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q2_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
 								        nwarps = NWARPS_Q2_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q2_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
 								        nwarps = NWARPS_Q2_K_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q2_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
 								        nwarps = NWARPS_Q2_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q2_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
 								        nwarps = NWARPS_Q2_K_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q3_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								#if QK_K == 256
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q3_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
 								        nwarps = NWARPS_Q3_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q3_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
 								        nwarps = NWARPS_Q3_K_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q3_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
 								        nwarps = NWARPS_Q3_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q3_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
 								        nwarps = NWARPS_Q3_K_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												k_quants tuning for Falcon-7b (#2816)

* Make ggml-cuda.cu build with QK_K = 64

Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces
a meaningful result.

* k_quants tuning for Falcon-7b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-08-27 14:19:59 +02:00
+								#endif
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q4_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q4_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
 								        nwarps = NWARPS_Q4_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q4_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
 								        nwarps = NWARPS_Q4_K_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q4_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
 								        nwarps = NWARPS_Q4_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q4_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
 								        nwarps = NWARPS_Q4_K_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q5_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q5_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
 								        nwarps = NWARPS_Q5_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q5_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
 								        nwarps = NWARPS_Q5_K_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q5_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
 								        nwarps = NWARPS_Q5_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q5_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
 								        nwarps = NWARPS_Q5_K_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
 								static void ggml_mul_mat_q6_K_q8_1_cuda(
 								    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
 								    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								    int mmq_x, mmq_y, nwarps;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    if (compute_capability >= CC_RDNA2) {
 								        mmq_x  =  MMQ_X_Q6_K_RDNA2;
 								        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
 								        nwarps = NWARPS_Q6_K_RDNA2;
 								    } else if (compute_capability >= CC_OFFSET_AMD) {
 								        mmq_x  =  MMQ_X_Q6_K_RDNA1;
 								        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
 								        nwarps = NWARPS_Q6_K_RDNA1;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								    } else if (compute_capability >= CC_VOLTA) {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        mmq_x  =  MMQ_X_Q6_K_AMPERE;
 								        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
 								        nwarps = NWARPS_Q6_K_AMPERE;
 								    } else if (compute_capability >= MIN_CC_DP4A) {
 								        mmq_x  =  MMQ_X_Q6_K_PASCAL;
 								        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
 								        nwarps = NWARPS_Q6_K_PASCAL;
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    } else {
-												CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)


											
										
										
											2023-08-13 00:24:45 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 								    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
 								    const dim3 block_nums(block_num_x, block_num_y, 1);
 								    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 								    if (nrows_x % mmq_y == 0) {
 								        const bool need_check = false;
 								        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 								    } else {
 								        const bool need_check = true;
 								        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
 								            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								    }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								static void ggml_mul_mat_p021_f16_f32_cuda(
 								    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
 								    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
 								    const dim3 block_nums(1, nrows_x, nchannels_y);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const dim3 block_dims(WARP_SIZE, 1, 1);
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
 								static void ggml_mul_mat_vec_nc_f16_f32_cuda(
 								    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    const dim3 block_nums(1, nrows_x, nchannels_y);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const dim3 block_dims(WARP_SIZE, 1, 1);
 								    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
 								static void ggml_cpy_f16_f32_cuda(
 								    const char * cx, char * cdst, const int ne,
 								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
 								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								static void ggml_cpy_f32_f32_cuda(
 								    const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
 								static void ggml_cpy_f32_f16_cuda(
 								    const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								static void ggml_cpy_f32_q8_0_cuda(
 								    const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
 								    GGML_ASSERT(ne % QK8_0 == 0);
 								    const int num_blocks = ne / QK8_0;
 								    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								}
 								static void ggml_cpy_f32_q4_0_cuda(
 								    const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
 								    GGML_ASSERT(ne % QK4_0 == 0);
 								    const int num_blocks = ne / QK4_0;
 								    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								}
 								static void ggml_cpy_f32_q4_1_cuda(
 								    const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
 								    GGML_ASSERT(ne % QK4_1 == 0);
 								    const int num_blocks = ne / QK4_1;
 								    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								}
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								static void ggml_cpy_f16_f16_cuda(
 								    const char * cx, char * cdst, const int ne,
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 								    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
 								    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 								    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								}
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
 								    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 								}
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
 								    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
 								    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								template<typename T>
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								static void rope_cuda(
 								    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
 								    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 								) {
-												CUDA: fix RoPE asserts, block sizes (#2833)


											
										
										
											2023-08-28 13:23:55 +02:00
+								    GGML_ASSERT(ncols % 2 == 0);
 								    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-												ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-22 15:25:19 +02:00
+								    const dim3 block_nums(nrows, num_blocks_x, 1);
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    if (pos == nullptr) {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
 								            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
 								        );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    } else {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
 								            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
 								        );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    }
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								template<typename T>
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								static void rope_neox_cuda(
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 								) {
-												CUDA: fix RoPE asserts, block sizes (#2833)


											
										
										
											2023-08-28 13:23:55 +02:00
+								    GGML_ASSERT(ncols % 2 == 0);
 								    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 10:55:59 +02:00
+								    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
 								    const dim3 block_nums(nrows, num_blocks_x, 1);
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
 								    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 								    const float inv_ndims = -1.0f / n_dims;
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    if (pos == nullptr) {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
 								            theta_scale, inv_ndims
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    } else {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
 								            theta_scale, inv_ndims
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    }
-												cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)

* cuda : add RoPE kernel for mode == 2 (NeoX)

* falcon : do not offload the embeddings layer
											
										
										
											2023-08-25 10:55:59 +02:00
+								}
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								static void rope_glm_f32_cuda(
 								    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
 								    float freq_base, int n_ctx, cudaStream_t stream
 								) {
-												sync : ggml (CUDA GLM RoPE + POSIX) (#3082)

ggml-ci
											
										
										
											2023-09-08 16:58:07 +02:00
+								    GGML_ASSERT(ncols % 4 == 0);
 								    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
 								    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								    const dim3 block_nums(num_blocks_x, nrows, 1);
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								}
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
 								                           const int k_rows, const int n_heads_log2_floor, const float m0,
 								                           const float m1, cudaStream_t stream) {
 								    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
 								    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
 								    const dim3 block_nums(num_blocks_x, nrows, 1);
 								    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 								    const dim3 block_dims(WARP_SIZE, 1, 1);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								    const dim3 block_nums(nrows, 1, 1);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 								}
 								static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
 								    // bitonic sort requires ncols to be power of 2
 								    GGML_ASSERT((ncols & (ncols - 1)) == 0);
 								    const dim3 block_dims(ncols, 1, 1);
 								    const dim3 block_nums(1, nrows, 1);
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (order == GGML_SORT_ORDER_ASC) {
 								        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 								    } else if (order == GGML_SORT_ORDER_DESC) {
 								        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    } else {
 								        GGML_ASSERT(false);
 								    }
 								}
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-												ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-22 15:25:19 +02:00
+								    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-												ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-22 15:25:19 +02:00
+								    const dim3 block_nums(nrows_x, block_num_x, 1);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 								}
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								    int nth = WARP_SIZE;
 								    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
 								    const dim3 block_dims(nth,     1, 1);
-												ggml-cuda : use graph allocator (#2684)

use a different function for no_alloc to avoid breaking backwards compat, fixes lora

remove 512 n_batch limit

fixed 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-22 15:25:19 +02:00
+								    const dim3 block_nums(nrows_x, 1, 1);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
 								    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
 								    const uint32_t n_head_kv   = nrows_x/nrows_y;
 								    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
 								    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
 								    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    if (shmem < g_device_caps[g_main_device].smpb) {
 								        switch (ncols_x) {
 								            case 32:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            case 64:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            case 128:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            case 256:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            case 512:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            case 1024:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            case 2048:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            case 4096:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								            default:
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								                break;
 								        }
 								    } else {
 								        const size_t shmem_low = WARP_SIZE*sizeof(float);
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								template <typename T>
 								static void im2col_cuda(const float* x, T* dst,
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
 								    int64_t batch, int64_t batch_offset, int64_t offset_delta,
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
 								    const int parallel_elements = OW * KW * KH;
 								    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								    dim3 block_nums(num_blocks, OH, batch * IC);
 								    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								}
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								// buffer pool for cuda
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								#define MAX_CUDA_BUFFERS 256
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
 								struct scoped_spin_lock {
 								    std::atomic_flag& lock;
 								    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
 								        while (lock.test_and_set(std::memory_order_acquire)) {
 								            ; // spin
 								        }
 								    }
 								    ~scoped_spin_lock() {
 								        lock.clear(std::memory_order_release);
 								    }
 								    scoped_spin_lock(const scoped_spin_lock&) = delete;
 								    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 								};
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 								// #define DEBUG_CUDA_MALLOC
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								struct ggml_cuda_buffer {
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    void * ptr = nullptr;
 								    size_t size = 0;
 								};
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    scoped_spin_lock lock(g_cuda_pool_lock);
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
+								#ifdef DEBUG_CUDA_MALLOC
 								    int nnz = 0;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    size_t max_size = 0;
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
+								#endif
 								    size_t best_diff = 1ull << 36;
 								    int ibest = -1;
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
+								        if (b.ptr != nullptr) {
 								#ifdef DEBUG_CUDA_MALLOC
 								            ++nnz;
 								            if (b.size > max_size) max_size = b.size;
 								#endif
 								            if (b.size >= size) {
 								                size_t diff = b.size - size;
 								                if (diff < best_diff) {
 								                    best_diff = diff;
 								                    ibest = i;
 								                    if (!best_diff) {
 								                        void * ptr = b.ptr;
 								                        *actual_size = b.size;
 								                        b.ptr = nullptr;
 								                        b.size = 0;
 								                        return ptr;
 								                    }
 								                }
 								            }
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								        }
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 03:14:14 +02:00
+								    }
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
+								    if (ibest >= 0) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
+								        void * ptr = b.ptr;
 								        *actual_size = b.size;
 								        b.ptr = nullptr;
 								        b.size = 0;
 								        return ptr;
 								    }
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    void * ptr;
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
+								    size_t look_ahead_size = (size_t) (1.05 * size);
 								    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    ggml_cuda_set_device(device);
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
+								    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
 								    *actual_size = look_ahead_size;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    g_cuda_pool_size[device] += look_ahead_size;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#ifdef DEBUG_CUDA_MALLOC
-												CUDA: fix DEBUG_CUDA_MALLOC (#5729)


											
										
										
											2024-02-26 15:36:38 +01:00
+								    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
 								            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#endif
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    return ptr;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    scoped_spin_lock lock(g_cuda_pool_lock);
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 03:14:14 +02:00
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								        if (b.ptr == nullptr) {
 								            b.ptr = ptr;
 								            b.size = size;
 								            return;
 								        }
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 03:14:14 +02:00
+								    }
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    ggml_cuda_set_device(device);
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								    CUDA_CHECK(cudaFree(ptr));
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    g_cuda_pool_size[device] -= size;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								}
 								#if !defined(GGML_USE_HIPBLAS)
 								// pool with virtual memory
 								static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 								static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-												cuda: fix vmm oom issue on NVIDIA AGX Orin (#4687)

Signed-off-by: hydai <hydai@secondstate.io>
											
										
										
											2023-12-29 17:31:19 +01:00
+								static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    scoped_spin_lock lock(g_cuda_pool_lock);
 								    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
 								    const size_t alignment = 128;
 								    size = alignment * ((size + alignment - 1) / alignment);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								    if (size > avail) {
 								        // round up to the next multiple of the granularity
 								        size_t reserve_size = size - avail;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        const size_t granularity = g_device_caps[device].vmm_granularity;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								        // allocate more physical memory
 								        CUmemAllocationProp prop = {};
 								        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 								        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        prop.location.id = device;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        CUmemGenericAllocationHandle handle;
 								        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
 								        // reserve virtual address space (if not already reserved)
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        if (g_cuda_pool_addr[device] == 0) {
 								            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        }
 								        // map at the end of the pool
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
 								        // the memory allocation handle is no longer needed after mapping
 								        CU_CHECK(cuMemRelease(handle));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								        // set access
 								        CUmemAccessDesc access = {};
 								        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        access.location.id = device;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								        // add to the pool
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        g_cuda_pool_size[device] += reserve_size;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
 								        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
 								        //       (unsigned long long) (reserve_size/1024/1024));
 								    }
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    GGML_ASSERT(g_cuda_pool_addr[device] != 0);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    *actual_size = size;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    g_cuda_pool_used[device] += size;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								#ifdef DEBUG_CUDA_MALLOC
-												CUDA: fix DEBUG_CUDA_MALLOC (#5729)


											
										
										
											2024-02-26 15:36:38 +01:00
+								    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#endif
 								    return ptr;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    scoped_spin_lock lock(g_cuda_pool_lock);
 								#ifdef DEBUG_CUDA_MALLOC
-												CUDA: fix DEBUG_CUDA_MALLOC (#5729)


											
										
										
											2024-02-26 15:36:38 +01:00
+								    printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#endif
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    g_cuda_pool_used[device] -= size;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
 								    // all deallocations must be in reverse order of the allocations
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
 								    if (g_device_caps[device].vmm) {
 								        return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    } else {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        return ggml_cuda_pool_malloc_leg(device, size, actual_size);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    }
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
 								    if (g_device_caps[device].vmm) {
 								        ggml_cuda_pool_free_vmm(device, ptr, size);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    } else {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_pool_free_leg(device, ptr, size);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    }
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								}
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								#else
 								#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
 								#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
 								#endif // !defined(GGML_USE_HIPBLAS)
 								template<typename T>
 								struct cuda_pool_alloc {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    int device = -1;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    T * ptr = nullptr;
 								    size_t actual_size = 0;
 								    // size is in number of elements
 								    T * alloc(size_t size) {
 								        GGML_ASSERT(ptr == nullptr);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        CUDA_CHECK(cudaGetDevice(&device));
 								        ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        return ptr;
 								    }
 								    cuda_pool_alloc(size_t size) {
 								        alloc(size);
 								    }
 								    ~cuda_pool_alloc() {
 								        if (ptr != nullptr) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            ggml_cuda_pool_free(device, ptr, actual_size);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        }
 								    }
 								    T * get() {
 								        return ptr;
 								    }
 								    cuda_pool_alloc() = default;
 								    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
 								    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
 								    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
 								    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
 								};
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 07:49:08 +01:00
+								static bool g_cublas_loaded = false;
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL bool ggml_cublas_loaded(void) {
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 07:49:08 +01:00
+								    return g_cublas_loaded;
 								}
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL void ggml_init_cublas() {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    static bool initialized = false;
 								    if (!initialized) {
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
 								#ifdef __HIP_PLATFORM_AMD__
 								        // Workaround for a rocBLAS bug when using multiple graphics cards:
 								        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
 								        rocblas_initialize();
 								        CUDA_CHECK(cudaDeviceSynchronize());
 								#endif
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 07:49:08 +01:00
+								        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
 								            initialized = true;
 								            g_cublas_loaded = false;
-												cuda : print message when initialization fails (#5512)

* cuda : print message when initialization fails

* use CUDA_NAME both times
											
										
										
											2024-02-15 16:49:01 +01:00
+								            fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 07:49:08 +01:00
+								            return;
 								        }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
 								        int64_t total_vram = 0;
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								#if defined(GGML_CUDA_FORCE_MMQ)
 								        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
 								#else
 								        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
 								#endif
 								#if defined(CUDA_USE_TENSOR_CORES)
 								        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 								#else
 								        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 								#endif
-												ROCm Port (#1087)

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com>
Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-authored-by: jammm <2500920+jammm@users.noreply.github.com>
Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
											
										
										
											2023-08-25 11:09:42 +02:00
+								        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-												sync : ggml (conv ops + cuda MSVC fixes) (#3765)

ggml-ci
											
										
										
											2023-10-24 20:51:20 +02:00
+								        for (int id = 0; id < g_device_count; ++id) {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            int device_vmm = 0;
 								#if !defined(GGML_USE_HIPBLAS)
 								            CUdevice device;
 								            CU_CHECK(cuDeviceGet(&device, id));
 								            CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
 								            if (device_vmm) {
 								                CUmemAllocationProp alloc_prop = {};
 								                alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
 								                alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 								                alloc_prop.location.id = id;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            }
 								#endif // !defined(GGML_USE_HIPBLAS)
 								            g_device_caps[id].vmm = !!device_vmm;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								            cudaDeviceProp prop;
 								            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								            g_default_tensor_split[id] = total_vram;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								            total_vram += prop.totalGlobalMem;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#else
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								            g_device_caps[id].smpb = prop.sharedMemPerBlock;
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								        }
-												sync : ggml (conv ops + cuda MSVC fixes) (#3765)

ggml-ci
											
										
										
											2023-10-24 20:51:20 +02:00
+								        for (int id = 0; id < g_device_count; ++id) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								            g_default_tensor_split[id] /= total_vram;
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								        }
-												Add Q4_3 support to cuBLAS (#1086)


											
										
										
											2023-04-20 20:49:53 +02:00
-												sync : ggml (conv ops + cuda MSVC fixes) (#3765)

ggml-ci
											
										
										
											2023-10-24 20:51:20 +02:00
+								        for (int id = 0; id < g_device_count; ++id) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            ggml_cuda_set_device(id);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            // create cuda streams
-												sync : ggml (conv ops + cuda MSVC fixes) (#3765)

ggml-ci
											
										
										
											2023-10-24 20:51:20 +02:00
+								            for (int is = 0; is < MAX_STREAMS; ++is) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
 								            }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								            // create cublas handle
 								            CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
 								            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
 								        }
-												cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
											
										
										
											2023-04-29 02:04:18 +02:00
-												Improve cuBLAS performance by using a memory pool (#1094)

* Improve cuBLAS performance by using a memory pool

* Move cuda specific definitions to ggml-cuda.h/cu

* Add CXX flags to nvcc

* Change memory pool synchronization mechanism to a spin lock
General code cleanup
											
										
										
											2023-04-21 21:59:17 +02:00
+								        // configure logging to stdout
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								        initialized = true;
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 07:49:08 +01:00
+								        g_cublas_loaded = true;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    }
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
 								        return nullptr;
-												Add Q4_3 support to cuBLAS (#1086)


											
										
										
											2023-04-20 20:49:53 +02:00
+								    }
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
 								    void * ptr = nullptr;
 								    cudaError_t err = cudaMallocHost((void **) &ptr, size);
 								    if (err != cudaSuccess) {
-												fallback to CPU buffer if host buffer alloc fails (#4610)


											
										
										
											2023-12-23 16:10:51 +01:00
+								        // clear the error
-												Fixed WSL cuda's OOM error (#1594)

* In the function , add the cuda error bypass.

* remove excessive codes and prints

---------

Co-authored-by: liang <liangmanlai@126.com>
											
										
										
											2023-06-11 15:20:52 +02:00
+								        cudaGetLastError();
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								            size/1024.0/1024.0, cudaGetErrorString(err));
 								        return nullptr;
 								    }
 								    return ptr;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL void ggml_cuda_host_free(void * ptr) {
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								    CUDA_CHECK(cudaFreeHost(ptr));
-												Improve cuBLAS performance by dequantizing on the GPU (#1065)


											
										
										
											2023-04-20 03:14:14 +02:00
+								}
-												cuBLAS: non-contiguous tensor support (#1215)

* Cuda: non-contiguous tensor support

* remove extra stuff

* rename

* fix error

* more fixes, now OpenBLAS and CLBlast build too

* now then?
											
										
										
											2023-04-29 01:31:56 +02:00
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								static cudaError_t ggml_cuda_cpy_tensor_2d(
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    cudaMemcpyKind kind;
 								    char * src_ptr;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (src->backend == GGML_BACKEND_TYPE_CPU) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        kind = cudaMemcpyHostToDevice;
 								        src_ptr = (char *) src->data;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
 								        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        kind = cudaMemcpyDeviceToDevice;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        int id;
 								        CUDA_CHECK(cudaGetDevice(&id));
 								        src_ptr = (char *) extra->data_device[id];
 								    } else {
 								        GGML_ASSERT(false);
 								    }
 								    char * dst_ptr = (char *) dst;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int64_t ne0 = src->ne[0];
 								    const int64_t nb0 = src->nb[0];
 								    const int64_t nb1 = src->nb[1];
 								    const int64_t nb2 = src->nb[2];
 								    const int64_t nb3 = src->nb[3];
-												cuBLAS: non-contiguous tensor support (#1215)

* Cuda: non-contiguous tensor support

* remove extra stuff

* rename

* fix error

* more fixes, now OpenBLAS and CLBlast build too

* now then?
											
										
										
											2023-04-29 01:31:56 +02:00
+								    const enum ggml_type type = src->type;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int64_t ts = ggml_type_size(type);
 								    const int64_t bs = ggml_blck_size(type);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    int64_t i1_diff = i1_high - i1_low;
-												cuBLAS: non-contiguous tensor support (#1215)

* Cuda: non-contiguous tensor support

* remove extra stuff

* rename

* fix error

* more fixes, now OpenBLAS and CLBlast build too

* now then?
											
										
										
											2023-04-29 01:31:56 +02:00
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    if (nb0 == ts && nb1 == ts*ne0/bs) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    } else if (nb0 == ts) {
 								        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
 								    } else {
 								        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
 								            const void * rx = (const void *) ((const char *) x + i1*nb1);
 								            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
 								            // pretend the row is a matrix with cols=1
 								            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
 								            if (r != cudaSuccess) return r;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        return cudaSuccess;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    }
 								}
 								static void ggml_cuda_op_get_rows(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								    GGML_ASSERT(src1->type == GGML_TYPE_I32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
 								    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
 								    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								    const int32_t * src1_i32 = (const int32_t *) src1_d;
 								    switch (src0->type) {
 								        case GGML_TYPE_F16:
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            break;
 								        case GGML_TYPE_F32:
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            break;
 								        case GGML_TYPE_Q4_0:
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            break;
 								        case GGML_TYPE_Q4_1:
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            break;
 								        case GGML_TYPE_Q5_0:
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            break;
 								        case GGML_TYPE_Q5_1:
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            break;
 								        case GGML_TYPE_Q8_0:
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            break;
 								        default:
 								            // TODO: k-quants
-												cuda : better error message for ggml_get_rows (#4561)

* Update ggml-cuda.cu

* Update ggml-cuda.cu

* Update ggml-cuda.cu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 18:06:44 +01:00
+								            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            GGML_ASSERT(false);
 								            break;
 								    }
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								template<class op>
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_bin_bcast(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												ggml : revert CUDA broadcast changes from #2183 (#2191)


											
										
										
											2023-07-12 09:54:19 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 18:35:54 +02:00
+								    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 18:35:54 +02:00
+								    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
-												finetune : add -ngl parameter (#3762)

* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU

"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
											
										
										
											2023-11-01 12:49:04 +01:00
+								    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 18:35:54 +02:00
+								    } else {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
 								            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-												CUDA GPU acceleration for LoRAs + f16 models (#1970)


											
										
										
											2023-06-28 18:35:54 +02:00
+								        GGML_ASSERT(false);
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
 								static void ggml_cuda_op_repeat(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    (void) src1;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    (void) src1_d;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_add(
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_acc(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
 								    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
 								    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
 								    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
 								    int offset = dst->op_params[3] / 4; // offset in bytes
 								    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
 								    (void) dst;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_mul(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 								}
-												cuda : support broadcast add & mul (#2192)

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-07-14 20:38:24 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_div(
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_gelu(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
 								    (void) src1;
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_silu(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    (void) src1;
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_gelu_quick(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_tanh(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_relu(
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								static void ggml_cuda_op_hardsigmoid(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
 								static void ggml_cuda_op_hardswish(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_leaky_relu(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    float negative_slope;
 								    memcpy(&negative_slope, dst->op_params, sizeof(float));
 								    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_sqr(
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_norm(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
 								    const int64_t ne00 = src0->ne[0];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t nrows = ggml_nrows(src0);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    float eps;
 								    memcpy(&eps, dst->op_params, sizeof(float));
 								    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
 								    (void) src1;
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_group_norm(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    int num_groups = dst->op_params[0];
 								    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_concat(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 								    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
 								        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
 								    }
 								    (void) src1;
 								    (void) dst;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_upscale(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 								    const int scale_factor = dst->op_params[0];
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    (void) src1;
 								    (void) dst;
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								    (void) src1_dd;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_pad(
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 								    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 								    pad_f32_cuda(src0_dd, dst_dd,
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
 								        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
 								static void ggml_cuda_op_arange(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 								    float start;
 								    float stop;
 								    float step;
 								    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
 								    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
 								    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
 								    int64_t steps = (int64_t)ceil((stop - start) / step);
 								    GGML_ASSERT(ggml_nelements(dst) == steps);
 								    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
 								    (void) src0;
 								    (void) src1;
 								    (void) src0_dd;
 								    (void) src1_dd;
 								}
 								static void ggml_cuda_op_timestep_embedding(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 								    const int dim = dst->op_params[0];
 								    const int max_period = dst->op_params[1];
 								    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
 								    (void) src1;
 								    (void) dst;
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								    (void) src1_dd;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_rms_norm(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int64_t ne00 = src0->ne[0];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t nrows = ggml_nrows(src0);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												make rms_norm_eps a parameter (#2374)

* make rms_norm_eps a parameter

* add rms_norm_eps to command line

* fix baby llama, test-grad0

* use scientific notation for eps param in the help

ggml-ci
											
										
										
											2023-07-24 17:57:12 +02:00
+								    float eps;
 								    memcpy(&eps, dst->op_params, sizeof(float));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    (void) src1;
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_mul_mat_q(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const int64_t src1_padded_row_size, cudaStream_t stream) {
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne10 = src1->ne[0];
 								    GGML_ASSERT(ne10 % QK8_1 == 0);
 								    const int64_t ne0 = dst->ne[0];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t row_diff = row_high - row_low;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    // the main device has a larger memory buffer to hold the results from all GPUs
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								    // nrows_dst == nrows of the matrix that the kernel writes into
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
 								    switch (src0->type) {
 								        case GGML_TYPE_Q4_0:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q4_1:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q5_0:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q5_1:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q8_0:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q2_K:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q3_K:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q4_K:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q5_K:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        case GGML_TYPE_Q6_K:
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								            break;
 								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
 								    (void) src1;
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_ddf_i;
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								    int64_t min_compute_capability = INT_MAX;
 								    int64_t max_compute_capability = INT_MIN;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    for (int id = 0; id < g_device_count; ++id) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								        if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            if (min_compute_capability > g_device_caps[id].cc) {
 								                min_compute_capability = g_device_caps[id].cc;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								            }
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            if (max_compute_capability < g_device_caps[id].cc) {
 								                max_compute_capability = g_device_caps[id].cc;
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								            }
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        }
 								    }
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 								    switch(type) {
 								        case GGML_TYPE_Q4_0:
 								        case GGML_TYPE_Q4_1:
 								        case GGML_TYPE_Q5_0:
 								        case GGML_TYPE_Q5_1:
 								        case GGML_TYPE_Q8_0:
 								            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
 								        case GGML_TYPE_F16:
-												cuda : get_row_rounding F32 (#4095)

* Fix #4017

* Update ggml-cuda.cu

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
											
										
										
											2023-11-17 09:01:15 +01:00
+								        case GGML_TYPE_F32:
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								            return 1;
 								        case GGML_TYPE_Q2_K:
 								            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
 								        case GGML_TYPE_Q3_K:
 								            return min_compute_capability < CC_RDNA2 ? 128 : 64;
 								        case GGML_TYPE_Q4_K:
 								        case GGML_TYPE_Q5_K:
 								        case GGML_TYPE_Q6_K:
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								        case GGML_TYPE_IQ2_XXS:
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        case GGML_TYPE_IQ2_XS:
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								        case GGML_TYPE_IQ2_S:
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								        case GGML_TYPE_IQ3_XXS:
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								        case GGML_TYPE_IQ1_S:
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								        case GGML_TYPE_IQ4_NL:
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								        case GGML_TYPE_IQ4_XS:
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								        case GGML_TYPE_IQ3_S:
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
 								        default:
 								            GGML_ASSERT(false);
 								    }
 								#else
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								    switch(type) {
 								        case GGML_TYPE_Q4_0:
 								        case GGML_TYPE_Q4_1:
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        case GGML_TYPE_Q5_0:
 								        case GGML_TYPE_Q5_1:
 								        case GGML_TYPE_Q8_0:
 								            return 64;
 								        case GGML_TYPE_F16:
-												cuda : get_row_rounding F32 (#4095)

* Fix #4017

* Update ggml-cuda.cu

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
											
										
										
											2023-11-17 09:01:15 +01:00
+								        case GGML_TYPE_F32:
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								            return 1;
 								        case GGML_TYPE_Q2_K:
 								        case GGML_TYPE_Q3_K:
 								        case GGML_TYPE_Q4_K:
 								        case GGML_TYPE_Q5_K:
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								        case GGML_TYPE_IQ2_XXS:
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        case GGML_TYPE_IQ2_XS:
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								        case GGML_TYPE_IQ2_S:
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								        case GGML_TYPE_IQ3_XXS:
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								        case GGML_TYPE_IQ1_S:
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								        case GGML_TYPE_IQ4_NL:
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								        case GGML_TYPE_IQ4_XS:
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								        case GGML_TYPE_IQ3_S:
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								        case GGML_TYPE_Q6_K:
 								            return 64;
 								        default:
 								            GGML_ASSERT(false);
 								    }
-												CUDA: mul_mat_q RDNA2 tunings (#2910)

* CUDA: mul_mat_q RDNA2 tunings

* Update ggml-cuda.cu

Co-authored-by: Henri Vasserman <henv@hot.ee>

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
											
										
										
											2023-09-13 11:20:24 +02:00
+								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
+								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
 								    const int64_t nrows = ggml_nrows(tensor);
 								    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
 								    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
 								    *row_low -= *row_low % rounding;
 								    if (id == g_device_count - 1) {
 								        *row_high = nrows;
 								    } else {
 								        *row_high = nrows*tensor_split[id + 1];
 								        *row_high -= *row_high % rounding;
 								    }
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_mul_mat_vec_q(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const int64_t src1_padded_row_size, cudaStream_t stream) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    const int64_t ne00 = src0->ne[0];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t row_diff = row_high - row_low;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								    const int64_t ne10 = src1->ne[0];
 								    GGML_ASSERT(ne10 % QK8_1 == 0);
 								    const int64_t ne0 = dst->ne[0];
 								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    // the main device has a larger memory buffer to hold the results from all GPUs
 								    // nrows_dst == nrows of the matrix that the kernel writes into
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    switch (src0->type) {
 								        case GGML_TYPE_Q4_0:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q4_1:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q5_0:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q5_1:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q8_0:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q2_K:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q3_K:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q4_K:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q5_K:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
 								        case GGML_TYPE_Q6_K:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            break;
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								        case GGML_TYPE_IQ2_XXS:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
+								            break;
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								        case GGML_TYPE_IQ2_XS:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												ggml : SOTA 2-bit quants (add IQ2_XS) (#4856)

* iq2_xs: basics

* iq2_xs: this should have been in the basics

* iq2_xs: CUDA and scalar CPU works

* iq2_xs: WIP Metal

* iq2_xs: Metal now works

* iq2_xs: working, but dog slow, ARM_NEON dot product

* iq2_xs: better ARM_NEON dot product

We are now at 19.5 t/s for TG-128 and 61 t/s for PP-512 when
running on the CPU.

* iq2_xs: AVX2 dot product - 19.5 t/s

* iq2_xs: faster AVX2 dit product

21.4 t/s for TG-128, 59.2 t/s for PP-512.
The latter is 2x compared to the previous version.

* iq2_xs: had forgotten to delete iq2-data.h

* Add llama enum for IQ2_XS

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-11 20:39:39 +01:00
+								            break;
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								        case GGML_TYPE_IQ2_S:
 								            mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								        case GGML_TYPE_IQ3_XXS:
-												CUDA: mul_mat_vec_q for batch sizes > 1 (#5351)


											
										
										
											2024-02-06 14:44:06 +01:00
+								            mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
-												CUDA: fixed mmvq kernel for bs 2,3,4 and -sm row (#5386)


											
										
										
											2024-02-07 12:40:26 +01:00
+								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-												SOTA 3-bit quants  (#5196)

* iq3_xxs: quantize/dequantize

RMSE seems a bit high-ish at about half-way between q2_K and
q3_K, so need to check more.

* iq3_xxs: CUDA dequantize works

* iq2_xxs: tuning quantization

* iq3_xxs: starting to look better

PPL on wiki.test.raw
LLaMA-v1-7B: 6.4218
LLaMA-v2-7B: 6.3560
Mistral-7B : 6.0717

This is better than Q3_K_XS, with a 5% reduction in quantized model
size.

* iq3_xxs: CUDA dot product

We have
PP-512: 5891 t/s
TG-128: 143.9 t/s

* iq3_xxs: scalar and AVX2 dot products

* iq3_xxs: ARM_NEON and Metal

Metal performance is decent, ARM_NEON is pathetic

* iq3_xxs: slightly better grid points

* Faster iq3_xxs and iq2_xs dot products on CUDA

* iq3_xxs: add some quant mix

* iq3_xxs: fix failing quantization test

Dot product still fails. Is this real?

* iq3_xxs: hopefully fix ROCm

* iq3_xxs: failing tests

This time the dot product accuracy did find an actual bug
in the AVX2 implementation.

* Add IQ3_XXS to test-backend-ops

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-30 14:14:12 +01:00
+								            break;
-.5 bit quantization (#5453)

* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-18 17:16:55 +01:00
+								        case GGML_TYPE_IQ1_S:
 								            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								        case GGML_TYPE_IQ4_NL:
 								            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								        case GGML_TYPE_IQ4_XS:
 								            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
-												IQ3_S: a much better alternative to Q3_K (#5676)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-24 15:23:52 +01:00
+								        case GGML_TYPE_IQ3_S:
 								            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
 								                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
 								            break;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1;
 								    (void) dst;
 								    (void) src1_ddf_i;
 								    (void) src1_ncols;
 								    (void) src1_padded_row_size;
 								}
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_dequantize_mul_mat_vec(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const int64_t src1_padded_row_size, cudaStream_t stream) {
-												Quantized dot products for CUDA mul mat vec (#2067)


											
										
										
											2023-07-05 14:19:42 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t row_diff = row_high - row_low;
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#ifdef GGML_CUDA_F16
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    cuda_pool_alloc<half> src1_dfloat_a;
 								    half * src1_dfloat = nullptr; // dfloat == half
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								    bool src1_convert_f16 =
 								        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
 								        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 								    if (src1_convert_f16) {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        src1_dfloat = src1_dfloat_a.alloc(ne00);
-												cuda : fix LLAMA_CUDA_F16 (#5262)


											
										
										
											2024-02-01 18:30:17 +01:00
+								        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								        GGML_ASSERT(to_fp16_cuda != nullptr);
 								        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    }
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
+								#else
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
+								#endif // GGML_CUDA_F16
-												Convert vector to f16 for dequantize mul mat vec (#1913)

* Convert vector to f16 for dmmv

* compile option

* Added compilation option description to README

* Changed cmake CUDA_ARCHITECTURES from "OFF" to "native"
											
										
										
											2023-06-19 10:23:56 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    switch (src0->type) {
 								        case GGML_TYPE_Q4_0:
 								            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q4_1:
 								            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q5_0:
 								            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q5_1:
 								            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q8_0:
 								            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q2_K:
 								            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q3_K:
 								            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q4_K:
 								            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q5_K:
 								            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_Q6_K:
 								            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        case GGML_TYPE_F16:
 								            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 								            break;
 								        default:
 								            GGML_ASSERT(false);
 								            break;
 								    }
-												cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)

* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
											
										
										
											2023-05-01 13:32:22 +02:00
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    (void) src1;
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_ddq_i;
 								    (void) src1_ncols;
 								    (void) src1_padded_row_size;
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_mul_mat_cublas(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 								    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const int64_t src1_padded_row_size, cudaStream_t stream) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												batched-bench : print params at start

											
										
										
											2023-10-25 09:26:27 +02:00
+								    GGML_ASSERT(src0_dd_i  != nullptr);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    GGML_ASSERT(src1_ddf_i != nullptr);
-												batched-bench : print params at start

											
										
										
											2023-10-25 09:26:27 +02:00
+								    GGML_ASSERT(dst_dd_i   != nullptr);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne10 = src1->ne[0];
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int64_t ne0 = dst->ne[0];
-												batched-bench : print params at start

											
										
										
											2023-10-25 09:26:27 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t row_diff = row_high - row_low;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    int id;
 								    CUDA_CHECK(cudaGetDevice(&id));
 								    // the main device has a larger memory buffer to hold the results from all GPUs
 								    // ldc == nrows of the matrix that cuBLAS writes into
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    const int compute_capability = g_device_caps[id].cc;
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								        //printf("this branch\n");
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        cuda_pool_alloc<half> src0_as_f16;
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								        if (src0->type != GGML_TYPE_F16) {
 								            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
 								            GGML_ASSERT(to_fp16_cuda != nullptr);
 								            size_t ne = row_diff*ne00;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            src0_as_f16.alloc(ne);
 								            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
+								        }
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
-												ggml-cuda : perform cublas mat mul of quantized types as f16 (#3412)

* ggml-cuda : perform cublas matrix multiplication of quantized types as fp16

* rename CC_TURING to CC_VOLTA

* disable fp16 mat mul completely with multi GPU
											
										
										
											2023-09-30 18:12:57 +02:00
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        cuda_pool_alloc<half> src1_as_f16;
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								        if (src1->type != GGML_TYPE_F16) {
 								            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								            GGML_ASSERT(to_fp16_cuda != nullptr);
 								            size_t ne = src1_ncols*ne10;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            src1_as_f16.alloc(ne);
 								            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								        }
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
 								        cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
 								        const half alpha_f16 = 1.0f;
 								        const half beta_f16 = 0.0f;
 								        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
 								        CUBLAS_CHECK(
 								            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 								                    row_diff, src1_ncols, ne10,
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
 								                                src1_ptr,       CUDA_R_16F, ne10,
 								                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								                    CUBLAS_COMPUTE_16F,
 								                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    } else {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        cuda_pool_alloc<float> src0_ddq_as_f32;
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								        cuda_pool_alloc<float> src1_ddq_as_f32;
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
 								        if (src0->type != GGML_TYPE_F32) {
 								            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
 								            GGML_ASSERT(to_fp32_cuda != nullptr);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            src0_ddq_as_f32.alloc(row_diff*ne00);
 								            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
+								        }
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								        if (src1->type != GGML_TYPE_F32) {
 								            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
 								            GGML_ASSERT(to_fp32_cuda != nullptr);
 								            src1_ddq_as_f32.alloc(src1_ncols*ne10);
 								            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
 								        }
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
-												ggml-cuda : perform cublas fp16 matrix multiplication as fp16 (#3370)

* ggml-cuda : perform cublas fp16 matrix multiplication as fp16

* try to fix rocm build

* restrict fp16 mat mul to volta and up
											
										
										
											2023-09-28 12:08:28 +02:00
 								        const float alpha = 1.0f;
 								        const float beta = 0.0f;
 								        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
 								        CUBLAS_CHECK(
 								            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 								                    row_diff, src1_ncols, ne10,
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								                    &alpha, src0_ddf_i,  ne00,
 								                            src1_ddf1_i, ne10,
 								                    &beta,  dst_dd_i,    ldc));
-												CUDA: fix LoRAs (#3130)


											
										
										
											2023-09-13 00:15:33 +02:00
+								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_ddq_i;
 								    (void) src1_padded_row_size;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_rope(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src0->type == dst->type);
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const int64_t ne00 = src0->ne[0];
-												CUDA: Implemented row flattening for non-glm RoPE (#2468)


											
										
										
											2023-07-31 14:32:30 +02:00
+								    const int64_t ne01 = src0->ne[1];
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								    const int64_t ne2 = dst->ne[2];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t nrows = ggml_nrows(src0);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								    //const int n_past      = ((int32_t *) dst->op_params)[0];
 								    const int n_dims      = ((int32_t *) dst->op_params)[1];
 								    const int mode        = ((int32_t *) dst->op_params)[2];
 								    const int n_ctx       = ((int32_t *) dst->op_params)[3];
 								    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-												Custom RoPE + bettter memory management for CUDA (#2295)

* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2023-07-21 16:27:51 +02:00
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								    // RoPE alteration for extended context
 								    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 								    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
 								    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
 								    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
 								    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
 								    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
 								    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
 								    const int32_t * pos = nullptr;
 								    if ((mode & 1) == 0) {
 								        GGML_ASSERT(src1->type == GGML_TYPE_I32);
 								        GGML_ASSERT(src1->ne[0] == ne2);
 								        pos = (const int32_t *) src1_dd;
 								    }
-												cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)

* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
											
										
										
											2023-05-01 13:32:22 +02:00
-												llm : add Falcon support (#2717)

* llama : refactor GGUF constants into static maps

* llama : check if model architecture is known

* llama : refactor llama_model_load_internal()

* gguf : add KV constant maps

* llm : read arch-specific KVs

* convert : add dummy scores + types

* falcon : load tensor data (CPU only)

* llama : fix loading progress bar

* llama : add arch member to llama_model

* falcon : CPU inference working

* falcon : support non-40B models

* falcon : minor

* llama : minor updates

ggml-ci

* convert-falcon-hf-to-gguf.py : fix special token mapping

* llama.cpp : llama default UNK token = id 0

* llama.cpp : fix bpe tokenizer

* llama.cpp : fix the fix of bpe tokenizer

* ggml : pass eps to ggml_norm

* metal : implement RoPE (mode = 2) + avoid ggml_repeat

* ggml : ggml_repeat always creates new tensor

* falcon : copy-paste self-attention from LLaMA

* metal : print extra compute pipeline info

* falcon : minor changes (still chasing the Metal problem)

* llama.cpp : fix linefeed token

* metal : fix GELU kernel numerical stability by using precise::tanh

* metal : temporary workaround for the concurrency optimization bug

* falcon : add CUDA offloading (#2739)

* llama : better model naming and size reporting

* llama : prep new tokenizer support

* llama : advanced BPE tokenizer based on ggllm.cpp imlpementation

* llama : remove oboslete comment

ggml-ci

* common : remove obsolete BPE API + disable test-tokenizer-1

* llama : revert BPE special-case in llama_byte_to_token()

* cuda : add TODOs for RoPE NeoX implementation

* llama : default special tokens based on vocab type

* perplexity : add log for start of tokenization

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-08-23 22:08:04 +02:00
+								    const bool is_neox = mode & 2;
 								    const bool is_glm  = mode & 4;
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								    rope_corr_dims corr_dims;
 								    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    // compute
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								    if (is_glm) {
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								        GGML_ASSERT(false);
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-												llm : add Falcon support (#2717)

* llama : refactor GGUF constants into static maps

* llama : check if model architecture is known

* llama : refactor llama_model_load_internal()

* gguf : add KV constant maps

* llm : read arch-specific KVs

* convert : add dummy scores + types

* falcon : load tensor data (CPU only)

* llama : fix loading progress bar

* llama : add arch member to llama_model

* falcon : CPU inference working

* falcon : support non-40B models

* falcon : minor

* llama : minor updates

ggml-ci

* convert-falcon-hf-to-gguf.py : fix special token mapping

* llama.cpp : llama default UNK token = id 0

* llama.cpp : fix bpe tokenizer

* llama.cpp : fix the fix of bpe tokenizer

* ggml : pass eps to ggml_norm

* metal : implement RoPE (mode = 2) + avoid ggml_repeat

* ggml : ggml_repeat always creates new tensor

* falcon : copy-paste self-attention from LLaMA

* metal : print extra compute pipeline info

* falcon : minor changes (still chasing the Metal problem)

* llama.cpp : fix linefeed token

* metal : fix GELU kernel numerical stability by using precise::tanh

* metal : temporary workaround for the concurrency optimization bug

* falcon : add CUDA offloading (#2739)

* llama : better model naming and size reporting

* llama : prep new tokenizer support

* llama : advanced BPE tokenizer based on ggllm.cpp imlpementation

* llama : remove oboslete comment

ggml-ci

* common : remove obsolete BPE API + disable test-tokenizer-1

* llama : revert BPE special-case in llama_byte_to_token()

* cuda : add TODOs for RoPE NeoX implementation

* llama : default special tokens based on vocab type

* perplexity : add log for start of tokenization

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-08-23 22:08:04 +02:00
+								    } else if (is_neox) {
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								        if (src0->type == GGML_TYPE_F32) {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								            rope_neox_cuda(
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								                attn_factor, corr_dims, main_stream
 								            );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								        } else if (src0->type == GGML_TYPE_F16) {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								            rope_neox_cuda(
-												ggml-cuda : support stablelm rope (#4156)

* ggml-cuda : support stablelm rope

* remove unused freq_base kernel parameter

* add n_dims parameter to llm_build_k_shift, default to n_rot via overload

* llama : fix llm_build_k_shift args

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-24 18:04:31 +01:00
+								                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								                attn_factor, corr_dims, main_stream
 								            );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								        } else {
 								            GGML_ASSERT(false);
 								        }
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								    } else {
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								        if (src0->type == GGML_TYPE_F32) {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								            rope_cuda(
 								                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
 								                attn_factor, corr_dims, main_stream
 								            );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								        } else if (src0->type == GGML_TYPE_F16) {
-												llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>

											
										
										
											2023-11-01 23:04:33 +01:00
+								            rope_cuda(
 								                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
 								                attn_factor, corr_dims, main_stream
 								            );
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								        } else {
 								            GGML_ASSERT(false);
 								        }
-												ggml : sync (ggml_conv_2d, fix mul_mat bug, CUDA GLM rope)

											
										
										
											2023-07-14 15:36:41 +02:00
+								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												ggml: move op parameters from tensors to ggml_tensor::op_params (#2333)

* ggml: move op parameters from tensors to ggml_tensor::op_params

* alibi: use memcpy for float params

* remove `src[1] = NULL` in ops

											
										
										
											2023-07-23 14:36:02 +02:00
+								    (void) src1;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
-												cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
											
										
										
											2023-04-29 02:04:18 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_alibi(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
 								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t nrows = ggml_nrows(src0);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								    //const int n_past = ((int32_t *) dst->op_params)[0];
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								    const int n_head = ((int32_t *) dst->op_params)[1];
 								    float max_bias;
 								    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								    //GGML_ASSERT(ne01 + n_past == ne00);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								    GGML_ASSERT(n_head == ne02);
 								    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
 								    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
 								    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
 								    (void) src1;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								static void ggml_cuda_op_pool2d(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    const int32_t * opts = (const int32_t *)dst->op_params;
 								    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
 								    const int k0 = opts[1];
 								    const int k1 = opts[2];
 								    const int s0 = opts[3];
 								    const int s1 = opts[4];
 								    const int p0 = opts[5];
 								    const int p1 = opts[6];
 								    const int64_t IH = src0->ne[1];
 								    const int64_t IW = src0->ne[0];
 								    const int64_t N = dst->ne[3];
 								    const int64_t OC = dst->ne[2];
 								    const int64_t OH = dst->ne[1];
 								    const int64_t OW = dst->ne[0];
 								    const int parallel_elements = N * OC * OH * OW;
 								    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
 								    dim3 block_nums(num_blocks);
 								    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
 								    (void) src1;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_im2col(
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
 								    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 								    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
 								    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
 								    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
 								    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
 								    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
 								    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
 								    const int64_t IC = src1->ne[is_2D ? 2 : 1];
 								    const int64_t IH = is_2D ? src1->ne[1] : 1;
 								    const int64_t IW =         src1->ne[0];
 								    const int64_t KH = is_2D ? src0->ne[1] : 1;
 								    const int64_t KW =         src0->ne[0];
 								    const int64_t OH = is_2D ? dst->ne[2] : 1;
 								    const int64_t OW =         dst->ne[1];
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								    const int64_t batch = src1->ne[3];
 								    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								    if(dst->type == GGML_TYPE_F16) {
 								        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
 								    } else {
 								        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
 								    }
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
 								    (void) src0;
 								    (void) src0_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_sum_rows(
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 								    const int64_t ncols = src0->ne[0];
 								    const int64_t nrows = ggml_nrows(src0);
 								    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_argsort(
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_I32);
 								    const int64_t ncols = src0->ne[0];
 								    const int64_t nrows = ggml_nrows(src0);
 								    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
 								    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_diag_mask_inf(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int nrows0 = ggml_nrows(src0);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												ggml: move op parameters from tensors to ggml_tensor::op_params (#2333)

* ggml: move op parameters from tensors to ggml_tensor::op_params

* alibi: use memcpy for float params

* remove `src[1] = NULL` in ops

											
										
										
											2023-07-23 14:36:02 +02:00
+								    const int n_past = ((int32_t *) dst->op_params)[0];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												ggml: move op parameters from tensors to ggml_tensor::op_params (#2333)

* ggml: move op parameters from tensors to ggml_tensor::op_params

* alibi: use memcpy for float params

* remove `src[1] = NULL` in ops

											
										
										
											2023-07-23 14:36:02 +02:00
+								    (void) src1;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_soft_max(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    const int64_t ne00    = src0->ne[0];
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
+								    const int64_t nrows_x = ggml_nrows(src0);
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    const int64_t nrows_y = src0->ne[1];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    float scale    = 1.0f;
 								    float max_bias = 0.0f;
-												ggml : add ggml_soft_max_ext (#4256)

* metal : implement soft_max_ext

* cuda : implement soft_max_ext

* ggml : implement soft_max_ext (CPU)

* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32

* ggml : update soft max cpu

* cuda : do warp-based block reduce

* cuda : increase max block size to 1024

* cuda : fix warp reduction initialization of shared mem

* metal : warp-based reduction for soft max kernel

* metal : warp-based reduce for rms_norm

* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
											
										
										
											2023-12-01 09:51:24 +01:00
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
 								    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    // positions tensor
-												cuda, metal : fix nans in soft_max (#5574)

* cuda : fix nans in soft_max

* metal : fix nans in soft_max

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-19 09:04:45 +01:00
+								    float * src2_dd = nullptr;
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    cuda_pool_alloc<float> src2_f;
 								    ggml_tensor * src2 = dst->src[2];
 								    const bool use_src2 = src2 != nullptr;
 								    if (use_src2) {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
 								        if (src2_on_device) {
-												cuda, metal : fix nans in soft_max (#5574)

* cuda : fix nans in soft_max

* metal : fix nans in soft_max

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-19 09:04:45 +01:00
+								            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								            src2_dd = (float *) src2_extra->data_device[g_main_device];
 								        } else {
 								            src2_dd = src2_f.alloc(ggml_nelements(src2));
 								            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
 								        }
-												CUDA: faster softmax via shared memory + fp16 math (#4742)


											
										
										
											2024-01-09 08:58:55 +01:00
+								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												ggml : add ALiBi support for ggml_soft_max_ext (#5488)

* ggml : avoid recomputing alibi slopes (CPU)

* llama : reuse hparams.f_max_alibi_bias in all cases

ggml-ci

* ggml : support alibi bias in ggml_soft_max_ext (CPU + Metal)

ggml-ci

* ggml : handle all SRCs (do not break on first null)

ggml-ci

* tests : do not use slope for large soft_max

accumulates too much error

ggml-ci

* ggml : alternative ALiBi without extra tensor

We compute the slopes in the kernel

ggml-ci

* cuda : add ALiBi support in ggml_soft_max_ext

ggml-ci

* ggml : deprecate ggml_alibi

* ggml : support multi-sequence ALiBi (Metal)

ggml-ci

* cuda : add multi-seq ALiBi + remote F16 soft_max

ggml-ci

* ggml : update deprecation message

* ggml : fix pos ptr when no ALiBi

ggml-ci

* cuda : fix performance (pow -> powf)

* cuda : precompute ALiBi constants

* metal : pre-compute ALiBi slopes

ggml-ci

* llama : init kq_pos only if needed

ggml-ci

* test-backend-ops : add null pos test to soft_max

test-backend-ops : replace soft_max tests

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-17 22:04:16 +01:00
+								    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_scale(
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												llama : fix platforms without mmap (#4578)

* llama : fix platforms without mmap

* win32 : limit prefetch size to the file size

* fix win32 error clobber, unnecessary std::string in std::runtime_error
											
										
										
											2023-12-22 12:12:53 +01:00
+								    float scale;
 								    memcpy(&scale, dst->op_params, sizeof(float));
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    CUDA_CHECK(cudaGetLastError());
 								    (void) src1;
 								    (void) dst;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    (void) src1_dd;
 								}
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								static void ggml_cuda_op_clamp(
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
 								    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 								    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 23:10:09 +01:00
+								    float min;
 								    float max;
 								    memcpy(&min, dst->op_params, sizeof(float));
 								    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
 								    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
 								    CUDA_CHECK(cudaGetLastError());
 								    (void) src1;
 								    (void) dst;
 								    (void) src1_dd;
 								}
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
 								    const int64_t nrows0 = ggml_nrows(src0);
 								    const bool use_src1 = src1 != nullptr;
 								    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 								    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
 								    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
 								    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    const bool src0_on_device =             src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
 								    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
 								    const bool  dst_on_device =              dst->backend == GGML_BACKEND_TYPE_GPU;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
 								    // dd = data device
 								    float * src0_ddf = nullptr;
 								    float * src1_ddf = nullptr;
 								    float *  dst_ddf = nullptr;
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    cuda_pool_alloc<float> src0_f;
 								    cuda_pool_alloc<float> src1_f;
 								    cuda_pool_alloc<float>  dst_f;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
 								    ggml_cuda_set_device(g_main_device);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
 								    if (src0_on_device) {
 								        src0_ddf = (float *) src0_extra->data_device[g_main_device];
 								    } else {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
 								    }
-												ggml : change ggml_scale to take a float instead of tensor (#4573)

* ggml : change ggml_scale to take a float instead of tensor

* ggml : fix CPU implementation

* tests : fix test-grad0

ggml-ci
											
										
										
											2023-12-21 22:20:49 +01:00
+								    if (use_src1) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        if (src1_on_device) {
 								            src1_ddf = (float *) src1_extra->data_device[g_main_device];
 								        } else {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
 								        }
 								    }
 								    if (dst_on_device) {
 								        dst_ddf = (float *) dst_extra->data_device[g_main_device];
 								    } else {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    }
 								    // do the computation
 								    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
 								    CUDA_CHECK(cudaGetLastError());
 								    // copy dst to host if necessary
 								    if (!dst_on_device) {
 								        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
 								    }
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        CUDA_CHECK(cudaDeviceSynchronize());
 								    }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_set_peer_access(const int n_tokens) {
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 16:37:53 +02:00
+								    static bool peer_access_enabled = false;
 								    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
 								    if (peer_access_enabled == enable_peer_access) {
 								        return;
 								    }
 								#ifdef NDEBUG
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								    for (int id = 0; id < g_device_count; ++id) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_set_device(id);
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								        CUDA_CHECK(cudaDeviceSynchronize());
 								    }
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 16:37:53 +02:00
+								    for (int id = 0; id < g_device_count; ++id) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_set_device(id);
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 16:37:53 +02:00
 								        for (int id_other = 0; id_other < g_device_count; ++id_other) {
 								            if (id == id_other) {
 								                continue;
 								            }
 								            if (id != g_main_device && id_other != g_main_device) {
 								                continue;
 								            }
-												CUDA: fix peer access logic (#3231)


											
										
										
											2023-09-17 23:35:20 +02:00
+								            int can_access_peer;
 								            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
 								            if (can_access_peer) {
 								                if (enable_peer_access) {
-												cuda : ignore peer access already enabled errors (#5597)

* cuda : ignore peer access already enabled errors

* fix hip
											
										
										
											2024-02-19 23:40:26 +01:00
+								                    cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
 								                    if (err != cudaErrorPeerAccessAlreadyEnabled) {
 								                        CUDA_CHECK(err);
 								                    }
-												CUDA: fix peer access logic (#3231)


											
										
										
											2023-09-17 23:35:20 +02:00
+								                } else {
-												cuda : ignore peer access already enabled errors (#5597)

* cuda : ignore peer access already enabled errors

* fix hip
											
										
										
											2024-02-19 23:40:26 +01:00
+								                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
 								                    if (err != cudaErrorPeerAccessNotEnabled) {
 								                        CUDA_CHECK(err);
 								                    }
-												CUDA: fix peer access logic (#3231)


											
										
										
											2023-09-17 23:35:20 +02:00
+								                }
-												CUDA: enable peer access between devices (#2470)


											
										
										
											2023-09-17 16:37:53 +02:00
+								            }
 								        }
 								    }
 								#endif // NDEBUG
 								    peer_access_enabled = enable_peer_access;
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								// FIXME: move this somewhere else
 								struct ggml_backend_cuda_split_buffer_type_context {
 								    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
 								};
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								static void ggml_cuda_op_mul_mat(
 								    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
 								    const bool convert_src1_to_q8_1) {
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
 								    const int64_t ne03 = src0->ne[3];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t ne10 = src1->ne[0];
 								    const int64_t ne11 = src1->ne[1];
 								    const int64_t ne12 = src1->ne[2];
 								    const int64_t ne13 = src1->ne[3];
 								    const int64_t nrows1 = ggml_nrows(src1);
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
 								    GGML_ASSERT(ne03 == ne13);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								    const int64_t ne0 = dst->ne[0];
 								    const int64_t ne1 = dst->ne[1];
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int nb2 = dst->nb[2];
 								    const int nb3 = dst->nb[3];
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 								    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t i02_divisor = ne12 / ne02;
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    const size_t src0_ts = ggml_type_size(src0->type);
 								    const size_t src0_bs = ggml_blck_size(src0->type);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const size_t q8_1_ts = sizeof(block_q8_1);
 								    const size_t q8_1_bs = QK8_1;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 								    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 								    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const bool src0_is_contiguous = ggml_is_contiguous(src0);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const bool src1_is_contiguous = ggml_is_contiguous(src1);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
 								    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    GGML_ASSERT(!(split && ne02 > 1));
 								    GGML_ASSERT(!(split && ne03 > 1));
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    GGML_ASSERT(!(split && ne02 < ne12));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
 								    if (split) {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								        // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
 								        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
 								        tensor_split = buft_ctx->tensor_split;
 								    }
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    struct dev_data {
 								        cuda_pool_alloc<char>  src0_dd_alloc;
 								        cuda_pool_alloc<float> src1_ddf_alloc;
 								        cuda_pool_alloc<char>  src1_ddq_alloc;
 								        cuda_pool_alloc<float>   dst_dd_alloc;
 								        char  *  src0_dd = nullptr;
 								        float * src1_ddf = nullptr; // float
 								        char  * src1_ddq = nullptr; // q8_1
 								        float *   dst_dd = nullptr;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        int64_t  row_low;
 								        int64_t row_high;
 								    };
-												Only one CUDA stream per device for async compute (#1898)


											
										
										
											2023-06-17 19:15:02 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    dev_data dev[GGML_CUDA_MAX_DEVICES];
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												cuda : fix disabling device with --tensor-split 1,0 (#3951)

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-11-05 16:08:57 +01:00
+								    int used_devices = 0;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    for (int id = 0; id < g_device_count; ++id) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        // by default, use all rows
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        dev[id].row_low  = 0;
 								        dev[id].row_high = ne01;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        // for multi GPU, get the row boundaries from tensor split
 								        // and round to mul_mat_q tile sizes
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        if (split) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
-												CUDA: tuned mul_mat_q kernels (#2546)


											
										
										
											2023-08-09 09:42:34 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            if (id != 0) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								                dev[id].row_low  = ne01*tensor_split[id];
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                if (dev[id].row_low < ne01) {
 								                    dev[id].row_low -= dev[id].row_low % rounding;
-												CUDA: fixed row rounding for 0 tensor splits (#4594)


											
										
										
											2023-12-23 09:16:33 +01:00
+								                }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            }
-												CUDA: Quantized matrix matrix multiplication (#2160)

* mmq implementation for non k-quants

* q6_K

* q2_K

* q3_k

* q4_K

* vdr

* q5_K

* faster q8_1 loading

* loop unrolling

* add __restrict__

* q2_K sc_high

* GGML_CUDA_MMQ_Y

* Updated Makefile

* Update Makefile

* DMMV_F16 -> F16

* Updated README, CMakeLists

* Fix CMakeLists.txt

* Fix CMakeLists.txt

* Fix multi GPU out-of-bounds
											
										
										
											2023-07-29 23:04:44 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            if (id != g_device_count - 1) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								                dev[id].row_high  = ne01*tensor_split[id + 1];
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                if (dev[id].row_high < ne01) {
 								                    dev[id].row_high -= dev[id].row_high % rounding;
-												CUDA: fixed row rounding for 0 tensor splits (#4594)


											
										
										
											2023-12-23 09:16:33 +01:00
+								                }
-												CUDA: Fix models with output size != 32000 (#2480)


											
										
										
											2023-08-02 16:48:10 +02:00
+								            }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    }
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    for (int id = 0; id < g_device_count; ++id) {
 								        if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								            continue;
 								        }
-												cuda : fix disabling device with --tensor-split 1,0 (#3951)

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-11-05 16:08:57 +01:00
+								        used_devices++;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
 								        const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        ggml_cuda_set_device(id);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        cudaStream_t stream = g_cudaStreams[id][0];
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        if (src0_on_device && src0_is_contiguous) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            dev[id].src0_dd = (char *) src0_extra->data_device[id];
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        } else {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        if (src1_on_device && src1_is_contiguous) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            dev[id].src1_ddf = (float *) src1_extra->data_device[id];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        } else {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        if (convert_src1_to_q8_1) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            if (src1_on_device && src1_is_contiguous) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
-												cuda : revert CUDA pool stuff (#3944)

* Revert "cuda : add ROCM aliases for CUDA pool stuff (#3918)"

This reverts commit 629f917cd6b96ba1274c49a8aab163b1b189229d.

* Revert "cuda : use CUDA memory pool with async memory allocation/deallocation when available (#3903)"

This reverts commit d6069051de7165a4e06662c89257f5d2905bb156.

ggml-ci
											
										
										
											2023-11-05 08:12:13 +01:00
+								                CUDA_CHECK(cudaGetLastError());
-												ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-05-13 15:38:36 +02:00
+								            }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        if (dst_on_device) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            dev[id].dst_dd = (float *) dst_extra->data_device[id];
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        } else {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
 								            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    // if multiple devices are used they need to wait for the main device
 								    // here an event is recorded that signals that the main device has finished calculating the input data
-												cuda : fix disabling device with --tensor-split 1,0 (#3951)

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-11-05 16:08:57 +01:00
+								    if (split && used_devices > 1) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_set_device(g_main_device);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
 								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												cuda : fix disabling device with --tensor-split 1,0 (#3951)

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-11-05 16:08:57 +01:00
+								    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
 								        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
 								        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        for (int id = 0; id < g_device_count; ++id) {
 								            if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                continue;
 								            }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								            const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
 								            const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            const int64_t row_diff = dev[id].row_high - dev[id].row_low;
-												Windows nvcc workaround (#1753)

Fix gibberish output on Windows when using CUDA
											
										
										
											2023-06-09 13:58:15 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            ggml_cuda_set_device(id);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								            cudaStream_t stream = g_cudaStreams[id][is];
-												Windows nvcc workaround (#1753)

Fix gibberish output on Windows when using CUDA
											
										
										
											2023-06-09 13:58:15 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            // wait for main GPU data if necessary
 								            if (split && (id != g_main_device || is != 0)) {
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            }
 								            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
 								                const int64_t i03 = i0 / ne12;
 								                const int64_t i02 = i0 % ne12;
 								                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								                // for split tensors the data begins at i0 == i0_offset_low
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
 								                float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
 								                char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
 								                float *   dst_dd_i =   dev[id].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								                // the main device memory buffer can be on VRAM scratch, with space for all partial results
 								                // in that case an offset on dst_ddf_i is needed
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                    dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                }
 								                // copy src0, src1 to device if necessary
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                    if (id != g_main_device) {
 								                        if (convert_src1_to_q8_1) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                            char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
 								                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
 								                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                        } else {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
 								                                                            src1_ncols*ne10*sizeof(float), stream));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                        }
 								                    }
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                } else {
 								                    GGML_ASSERT(false);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                }
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
 								                    CUDA_CHECK(cudaGetLastError());
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                }
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                // do the computation
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                    dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
+								                CUDA_CHECK(cudaGetLastError());
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
 								                // copy dst to host or other device if necessary
 								                if (!dst_on_device) {
 								                    void * dst_off_device;
 								                    cudaMemcpyKind kind;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                        dst_off_device = dst->data;
 								                        kind = cudaMemcpyDeviceToHost;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                    } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                        dst_off_device = dst_extra->data_device[g_main_device];
 								                        kind = cudaMemcpyDeviceToDevice;
 								                    } else {
 								                        GGML_ASSERT(false);
 								                    }
 								                    if (split) {
 								                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
 								                        // dst is NOT transposed.
-												CUDA: faster multi GPU synchronization (#2448)


											
										
										
											2023-07-29 23:04:10 +02:00
+								                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
 								                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
 								                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								                        dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
 								#if !defined(GGML_USE_HIPBLAS)
 								                        if (kind == cudaMemcpyDeviceToDevice) {
 								                            // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
 								                            cudaMemcpy3DPeerParms p = {};
 								                            p.dstDevice = g_main_device;
 								                            p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
 								                            p.srcDevice = id;
 								                            p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
 								                            p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
 								                            CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
 								                        } else
 								#endif
 								                        {
 								                            CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
 								                                                            dst_dd_i, row_diff*sizeof(float),
 								                                                            row_diff*sizeof(float), src1_ncols,
 								                                                            kind, stream));
 								                        }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                    } else {
 								                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
 								                        dhf_dst_i += src1_col_0*ne0;
 								                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                    }
 								                }
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								                // add event for the main device to wait on until other device is done
 								                if (split && (id != g_main_device || is != 0)) {
 								                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
+								                }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								            }
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								        }
 								    }
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
+								    // main device waits for all other devices to be finished
 								    if (split && g_device_count > 1) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
 								        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_set_device(g_main_device);
 								        for (int id = 0; id < g_device_count; ++id) {
 								            if (dev[id].row_low == dev[id].row_high) {
-												cuda : fix disabling device with --tensor-split 1,0 (#3951)

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-11-05 16:08:57 +01:00
+								                continue;
 								            }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								            for (int64_t is = 0; is < is_max; ++is) {
-												Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes

* Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME into GGML_CUDA_ASSUME
											
										
										
											2023-09-16 16:55:43 +02:00
+								                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
+								            }
 								        }
 								    }
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								        ggml_cuda_set_device(g_main_device);
-												Better CUDA synchronization logic (#2057)


											
										
										
											2023-07-01 21:49:44 +02:00
+								        CUDA_CHECK(cudaDeviceSynchronize());
 								    }
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
 								}
 								static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
-												cuda : add gelu support

											
										
										
											2023-07-12 19:26:18 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
 								}
 								static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
 								}
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
 								}
 								static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
 								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
 								}
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								}
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
 								}
 								static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
 								}
 								static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
 								}
 								static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 								}
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
 								    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
 								    // dd = data device
 								    float * src0_ddf = nullptr;
 								    float * src1_ddf = nullptr;
 								    float *  dst_ddf = nullptr;
 								    cuda_pool_alloc<float>  dst_f;
 								    ggml_cuda_set_device(g_main_device);
 								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 								    if (dst_on_device) {
 								        dst_ddf = (float *) dst_extra->data_device[g_main_device];
 								    } else {
 								        dst_ddf = dst_f.alloc(ggml_nelements(dst));
 								    }
 								    // do the computation
 								    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
 								    CUDA_CHECK(cudaGetLastError());
 								    // copy dst to host if necessary
 								    if (!dst_on_device) {
 								        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
 								    }
 								    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
 								        CUDA_CHECK(cudaDeviceSynchronize());
 								    }
 								}
 								static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    if (!g_cublas_loaded) return false;
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 07:49:08 +01:00
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								    const int64_t ne10 = src1->ne[0];
 								    const int64_t ne0 = dst->ne[0];
 								    const int64_t ne1 = dst->ne[1];
 								    // TODO: find the optimal values for these
-												llama.cpp : split llama_context_params into model and context params (#3301)

* llama.cpp : split llama_context_params into model and context params

ggml-ci

* fix metal build

* fix freq_base/scale default to model value

* llama-bench : keep the same model between tests when possible

* move n_threads to llama_context_params, add n_threads_batch

* fix mpi build

* remove kv_size(), cuda scratch fixes

* remove low-vram option

* add n_threads_batch to system info, refactor to get_system_info()

* add documentation about --threads-batch to the READMEs

* llama-bench fix

* main : fix rope freq/scale warning

* llama.cpp : add llama_get_model
common : add llama_tokenize from model

* remove duplicated ctx/model functions

ggml-ci

* cuda : print total VRAM used
											
										
										
											2023-09-28 21:42:38 +02:00
+								    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
 								            src1->type == GGML_TYPE_F32 &&
 								             dst->type == GGML_TYPE_F32 &&
 								            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
 								    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
 								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
-												llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
											
										
										
											2023-07-23 14:09:47 +02:00
+								    const int64_t ne12 = src1->ne[2];
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    ggml_cuda_set_device(g_main_device);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    void * src0_ddq = src0_extra->data_device[g_main_device];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    GGML_ASSERT(!ggml_is_transposed(src0));
 								    GGML_ASSERT(!ggml_is_transposed(src1));
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    GGML_ASSERT(!ggml_is_permuted(src0));
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
 								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
 								    const int64_t ne02 = src0->ne[2];
 								    const int64_t nb01 = src0->nb[1];
 								    const int64_t nb02 = src0->nb[2];
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    const int64_t ne12 = src1->ne[2];
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    ggml_cuda_set_device(g_main_device);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    void * src0_ddq = src0_extra->data_device[g_main_device];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    const int64_t row_stride_x = nb01 / sizeof(half);
 								    const int64_t channel_stride_x = nb02 / sizeof(half);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static __global__ void k_compute_batched_ptrs(
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								        const half * src0_as_f16, const half * src1_as_f16, char * dst,
-												cuda : fix const ptrs warning causing ROCm build issues (#3913)


											
										
										
											2023-11-02 19:32:11 +01:00
+								        const void ** ptrs_src, void ** ptrs_dst,
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								        int64_t ne12, int64_t ne13,
 								        int64_t ne23,
 								        size_t  nb02, size_t  nb03,
 								        size_t  nb12, size_t  nb13,
 								        size_t  nbd2, size_t  nbd3,
 								        int64_t r2,   int64_t r3) {
 								    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
 								    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 23:10:09 +01:00
 								    if (i13 >= ne13 || i12 >= ne12) {
 								        return;
 								    }
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								    int64_t i03 = i13 / r3;
 								    int64_t i02 = i12 / r2;
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 23:10:09 +01:00
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
 								    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
 								    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 23:10:09 +01:00
+								}
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    GGML_ASSERT(!ggml_is_transposed(src0));
 								    GGML_ASSERT(!ggml_is_transposed(src1));
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    GGML_TENSOR_BINARY_OP_LOCALS
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    const int64_t ne_dst = ggml_nelements(dst);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    ggml_cuda_set_device(g_main_device);
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
 								    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 								    void * src0_ddq = src0_extra->data_device[g_main_device];
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    half * src0_f16 = (half *) src0_ddq;
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
 								    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 								    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 								    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 								    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 								    // convert src1 to fp16
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								    cuda_pool_alloc<half> src1_f16_alloc;
 								    if (src1->type != GGML_TYPE_F16) {
 								        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								        const int64_t ne_src1 = ggml_nelements(src1);
 								        src1_f16_alloc.alloc(ne_src1);
 								        GGML_ASSERT(to_fp16_cuda != nullptr);
 								        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
 								    }
 								    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								    cuda_pool_alloc<half> dst_f16;
 								    char * dst_t;
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
 								    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
 								    cudaDataType_t      cu_data_type    = CUDA_R_16F;
 								    // dst strides
 								    size_t nbd2 = dst->nb[2];
 								    size_t nbd3 = dst->nb[3];
 								    const half  alpha_f16 = 1.0f;
 								    const half  beta_f16  = 0.0f;
 								    const float alpha_f32 = 1.0f;
 								    const float beta_f32  = 0.0f;
 								    const void * alpha = &alpha_f16;
 								    const void * beta  = &beta_f16;
 								    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								        dst_t = (char *) dst_f16.alloc(ne_dst);
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
 								        nbd2 /= sizeof(float) / sizeof(half);
 								        nbd3 /= sizeof(float) / sizeof(half);
 								    } else {
 								        dst_t = (char *) dst_ddf;
 								        cu_compute_type = CUBLAS_COMPUTE_32F;
 								        cu_data_type    = CUDA_R_32F;
 								        alpha = &alpha_f32;
 								        beta  = &beta_f32;
 								    }
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
 								    GGML_ASSERT(ne12 % ne02 == 0);
 								    GGML_ASSERT(ne13 % ne03 == 0);
 								    // broadcast factors
 								    const int64_t r2 = ne12/ne02;
 								    const int64_t r3 = ne13/ne03;
 								#if 0
 								    // use cublasGemmEx
 								    {
 								        for (int i13 = 0; i13 < ne13; ++i13) {
 								            for (int i12 = 0; i12 < ne12; ++i12) {
 								                int i03 = i13 / r3;
 								                int i02 = i12 / r2;
 								                CUBLAS_CHECK(
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                            ne01, ne11, ne10,
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
 								                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
 								                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
 								                            cu_compute_type,
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								            }
 								        }
 								    }
 								#else
 								    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
 								        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
 								        // use cublasGemmStridedBatchedEx
 								        CUBLAS_CHECK(
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                ne01, ne11, ne10,
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00,  // strideA
 								                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10,  // strideB
 								                beta,  (      char *)    dst_t, cu_data_type, ne01,       nb2/nb0,   // strideC
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                ne12*ne13,
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								                cu_compute_type,
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								    } else {
 								        // use cublasGemmBatchedEx
 								        const int ne23 = ne12*ne13;
-												cuda : fix const ptrs warning causing ROCm build issues (#3913)


											
										
										
											2023-11-02 19:32:11 +01:00
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
 								        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 23:10:09 +01:00
 								        dim3 block_dims(ne13, ne12);
 								        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								                src0_f16, src1_f16, dst_t,
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								                ptrs_src.get(), ptrs_dst.get(),
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 23:10:09 +01:00
+								                ne12, ne13,
 								                ne23,
 								                nb02, nb03,
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
 								                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								                nbd2, nbd3,
-												ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891)

* ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel

* fix warnings
											
										
										
											2023-11-01 23:10:09 +01:00
+								                r2, r3);
 								        CUDA_CHECK(cudaGetLastError());
-												cuda : revert CUDA pool stuff (#3944)

* Revert "cuda : add ROCM aliases for CUDA pool stuff (#3918)"

This reverts commit 629f917cd6b96ba1274c49a8aab163b1b189229d.

* Revert "cuda : use CUDA memory pool with async memory allocation/deallocation when available (#3903)"

This reverts commit d6069051de7165a4e06662c89257f5d2905bb156.

ggml-ci
											
										
										
											2023-11-05 08:12:13 +01:00
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								        CUBLAS_CHECK(
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                ne01, ne11, ne10,
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
 								                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                ne23,
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								                cu_compute_type,
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								    }
 								#endif
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
 								        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-												ggml : fix some mul mat cases + add tests for src1 F16 (ggml/669)

* fixed mul-mat error for old GPUs

* style fixes

* add mul mat src1 f16 test cases, fix more cases

ggml-ci

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2023-12-29 09:32:31 +01:00
+								        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
-												llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : phi don't add BOS token

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove oboslete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-18 18:27:47 +01:00
+								    }
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
+								    const bool all_on_device =
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
 								        (src1->backend == GGML_BACKEND_TYPE_GPU) &&
 								        ( dst->backend == GGML_BACKEND_TYPE_GPU);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-												ggml-cuda : fix f16 mul mat (#3961)

* ggml-cuda : fix f16 mul mat

ggml-ci

* silence common.cpp warning (bonus)
											
										
										
											2023-11-05 18:45:16 +01:00
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    int64_t min_compute_capability = INT_MAX;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    bool any_pascal_with_slow_fp16 = false;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    if (split) {
 								        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
 								        auto & tensor_split = buft_ctx->tensor_split;
 								        for (int id = 0; id < g_device_count; ++id) {
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								            // skip devices that are not going to do any work:
 								            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
 								                continue;
 								            }
 								            if (min_compute_capability > g_device_caps[id].cc) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								                min_compute_capability = g_device_caps[id].cc;
 								            }
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								            if (g_device_caps[id].cc == 610) {
 								                any_pascal_with_slow_fp16 = true;
 								            }
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								        }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    } else {
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								        min_compute_capability    = g_device_caps[g_main_device].cc;
 								        any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    }
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    // check data types and tensor shapes for custom matrix multiplication kernels:
 								    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
 								        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
 								        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
 								    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
 								        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
 								        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
 								    bool              use_mul_mat_q =  ggml_cuda_supports_mmq(src0->type)
 								        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 23:12:53 +01:00
+								#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 13:52:01 +01:00
+								    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
-												CUDA: fixed tensor cores not being used on RDNA3 (#4697)


											
										
										
											2023-12-30 13:52:01 +01:00
+								#ifdef CUDA_USE_TENSOR_CORES
 								    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
 								#endif // CUDA_USE_TENSOR_CORES
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 23:12:53 +01:00
 								#else
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
 								    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
 								    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
 								    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
 								    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 23:12:53 +01:00
+								#ifdef CUDA_USE_TENSOR_CORES
 								    // when tensor cores are available, use them for large batch size
 								    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    use_mul_mat_q     = use_mul_mat_q     && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 23:12:53 +01:00
+								#endif // CUDA_USE_TENSOR_CORES
 								#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-												cuda : improve text-generation and batched decoding performance (#3776)

* cuda : prints wip

* cuda : new cublas gemm branch for multi-batch quantized src0

* cuda : add F32 sgemm branch

* cuda : fine-tune >= VOLTA params + use MMQ only for small batches

* cuda : remove duplicated cuBLAS GEMM code

* cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

* build : add compile option to force use of MMQ kernels
											
										
										
											2023-10-27 16:01:23 +02:00
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								    // if mmvq is available it's a better choice than dmmv:
 								#ifndef GGML_CUDA_FORCE_DMMV
 								    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
 								#endif // GGML_CUDA_FORCE_DMMV
-												SOTA 2-bit quants (#4773)

* iq2_xxs: basics

* iq2_xxs: scalar and AVX2 dot products

Needed to change Q8_K to have quants in the -127...127 range,
else the IQ2_XXS AVX implementation becomes very awkward.
The alternative would have been to use Q8_0 instead. Perhaps
I'll change later, for now this is what we have.

* iq2_xxs: ARM_NEON dot product

Somehow strangely slow (112 ms/token).

* iq2_xxs: WIP Metal

Dequantize works, something is still wrong with the
dot product.

* iq2_xxs: Metal dot product now works

We have
PP-512 = 475 t/s
TG-128 = 47.3 t/s

Not the greatest performance, but not complete garbage either.

* iq2_xxs: slighty faster dot product

TG-128 is now 48.4 t/s

* iq2_xxs: slighty faster dot product

TG-128 is now 50.9 t/s

* iq2_xxs: even faster Metal dot product

TG-128 is now 54.1 t/s.

Strangely enough, putting the signs lookup table
into shared memory has a bigger impact than the
grid values being in shared memory.

* iq2_xxs: dequantize CUDA kernel - fix conflict with master

* iq2_xxs: quantized CUDA dot product (MMVQ)

We get TG-128 = 153.1 t/s

* iq2_xxs: slightly faster CUDA dot product

TG-128 is now at 155.1 t/s.

* iq2_xxs: add to llama ftype enum

* iq2_xxs: fix MoE on Metal

* Fix missing MMQ ops when on hipBLAS

I had put the ggml_supports_mmq call at the wrong place.

* Fix bug in qequantize_row_iq2_xxs

The 0.25f factor was missing.
Great detective work by @ggerganov!

* Fixing tests

* PR suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-08 16:02:32 +01:00
-												cuda : add batched cuBLAS GEMM for faster attention (#3749)

* cmake : add helper for faster CUDA builds

* batched : add NGL arg

* ggml : skip nops in compute_forward

* cuda : minor indentation

* cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops)

* Apply suggestions from code review

These changes plus:

```c++
#define cublasGemmBatchedEx hipblasGemmBatchedEx
```

are needed to compile with ROCM. I haven't done performance testing, but it seems to work.

I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up.

* cuda : add ROCm / hipBLAS cublasGemmBatchedEx define

* cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases

* cuda : reduce mallocs in cublasGemmBatchedEx branch

* cuda : add TODO for calling cublas from kernel + using mem pool

---------

Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
											
										
										
											2023-10-24 15:48:37 +02:00
+								    // debug helpers
 								    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
 								    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
 								    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
 								    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
 								    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
 								    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 23:12:53 +01:00
+								    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-												batched-bench : print params at start

											
										
										
											2023-10-25 09:26:27 +02:00
+								        // KQ single-batch
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-												CUDA: fix tensor core logic for Pascal and HIP (#4682)


											
										
										
											2023-12-29 23:12:53 +01:00
+								    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-												batched-bench : print params at start

											
										
										
											2023-10-25 09:26:27 +02:00
+								        // KQV single-batch
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-												batched-bench : print params at start

											
										
										
											2023-10-25 09:26:27 +02:00
+								        // KQ + KQV multi-batch
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								        ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
 								    } else if (use_dequantize_mul_mat_vec) {
 								        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
 								    } else if (use_mul_mat_vec_q) {
 								        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
 								    } else if (use_mul_mat_q) {
 								        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    } else {
-												CUDA: mul_mat_vec_q tiling, refactor mul mat logic (#5434)

* CUDA: mul_mat_vec_q tiling, refactor mul mat logic

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-02-11 19:08:39 +01:00
+								        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    }
 								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								#if 0
 								template<typename ... Srcs>
 								static __global__ void k_compute_batched_ptrs_id(
 								        const void ** ptrs_src, void ** ptrs_dst,
 								        int ne12, int ne13,
 								        int ne23,
 								        int nb02, int nb03,
 								        int nb12, int nb13,
 								        int nb2, int nb3,
 								        int r2, int r3,
 								        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
 								        const half * src1_f16, half * dst_f16,
 								        const int32_t * ids, const int id,
 								        Srcs... src0s) {
 								    int i = ids[id];
 								    half * src0_f16;
 								    const void * srcs_ar[] = { (const half *) src0s... };
 								    if (src0_type == GGML_TYPE_F16) {
 								        src0_f16 = (half *) srcs_ar[i];
 								    } else {
 								        src0_f16 = src0_as_f16;
 								        if (threadIdx.x == 0 && threadIdx.y == 0) {
 								            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
 								            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
 								        }
 								    }
 								    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
 								    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
 								    if (i13 >= ne13 || i12 >= ne12) {
 								        return;
 								    }
 								    int i03 = i13 / r3;
 								    int i02 = i12 / r2;
 								    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
 								    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
 								    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
 								}
 								static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
 								    const struct ggml_tensor * ids = dst->src[0];
 								    const struct ggml_tensor * src1 = dst->src[1];
 								    const struct ggml_tensor * src00 = dst->src[2];
 								    const int id = dst->op_params[0];
 								    GGML_ASSERT(!ggml_is_transposed(src00));
 								    GGML_ASSERT(!ggml_is_transposed(src1));
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 								    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
 								    const int64_t ne01 = src00->ne[1];
 								    const int64_t ne02 = src00->ne[2];
 								    const int64_t ne03 = src00->ne[3];
 								    //const int64_t nb01 = src00->nb[1];
 								    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
 								    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
 								    const int64_t ne10 = src1->ne[0];
 								    const int64_t ne11 = src1->ne[1];
 								    const int64_t ne12 = src1->ne[2];
 								    const int64_t ne13 = src1->ne[3];
 								    //const int64_t nb11 = src1->nb[1];
 								    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
 								    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
 								    const int64_t ne1 = ggml_nelements(src1);
 								    const int64_t ne  = ggml_nelements(dst);
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    ggml_cuda_set_device(g_main_device);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 								    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
 								    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 								    //void * src0_ddq = src0_extra->data_device[g_main_device];
 								    //half * src0_as_f16 = (half *) src0_ddq;
 								    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 								    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 								    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 								    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 								    // convert src1 to fp16
 								    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 								    GGML_ASSERT(to_fp16_cuda != nullptr);
 								    size_t src1_as = 0;
 								    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
 								    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
 								    size_t dst_as = 0;
 								    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
 								    GGML_ASSERT(ne12 % ne02 == 0);
 								    GGML_ASSERT(ne13 % ne03 == 0);
 								    // broadcast factors
 								    const int64_t r2 = ne12/ne02;
 								    const int64_t r3 = ne13/ne03;
 								    const half alpha_f16 = 1.0f;
 								    const half beta_f16  = 0.0f;
 								    // use cublasGemmBatchedEx
 								    const int ne23 = ne12*ne13;
 								    const void ** ptrs_src = nullptr;
 								          void ** ptrs_dst = nullptr;
 								    size_t ptrs_src_s = 0;
 								    size_t ptrs_dst_s = 0;
 								    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
 								    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 								    int64_t src0_ne = ggml_nelements(src00);
 								    half * src0_as_f16 = nullptr;
 								    size_t src0_as = 0;
 								    if (src00->type != GGML_TYPE_F16) {
 								        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
 								    }
 								    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
 								    dim3 block_dims(ne13, ne12);
 								    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
 								            ptrs_src, ptrs_dst,
 								            ne12, ne13,
 								            ne23,
 								            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
 								            nb12, nb13,
 								            dst->nb[2], dst->nb[3],
 								            r2, r3,
 								            src00->type, src0_as_f16, src0_ne,
 								            src1_as_f16, dst_f16,
 								            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
 								            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
 								            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
 								            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
 								            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
 								    );
 								    CUDA_CHECK(cudaGetLastError());
 								    CUBLAS_CHECK(
 								    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
 								            ne01, ne11, ne10,
 								            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
 								                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
 								            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
 								            ne23,
 								            CUBLAS_COMPUTE_16F,
 								            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 								    if (src0_as != 0) {
 								        ggml_cuda_pool_free(src0_as_f16, src0_as);
 								    }
 								    if (ptrs_src_s != 0) {
 								        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
 								    }
 								    if (ptrs_dst_s != 0) {
 								        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
 								    }
 								    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
 								    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
 								    ggml_cuda_pool_free(src1_as_f16, src1_as);
 								    ggml_cuda_pool_free(dst_f16, dst_as);
 								}
 								#endif
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								#if 0
 								    ggml_cuda_mul_mat_id_cublas(dst);
 								    // TODO: mmq/mmv support
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								#endif
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												cuda : fix tensor size calculation for non-split buffer (#5145)


											
										
										
											2024-01-26 18:59:43 +01:00
+								    const size_t nb11 = src1->nb[1];
 								    const size_t nb1  =  dst->nb[1];
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    const struct ggml_tensor * ids = src0;
 								    const int32_t id = ((int32_t *) dst->op_params)[0];
 								    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    std::vector<char> ids_host(ggml_nbytes(ids));
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    cudaStream_t stream = g_cudaStreams[g_main_device][0];
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
 								        CUDA_CHECK(cudaStreamSynchronize(stream));
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    } else {
 								        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
 								    }
 								    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
 								    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
 								    ggml_tensor_extra_gpu src1_row_extra;
 								    ggml_tensor_extra_gpu dst_row_extra;
 								    ggml_tensor src1_row = *src1;
 								    ggml_tensor dst_row = *dst;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    src1_row.backend = GGML_BACKEND_TYPE_GPU;
 								    dst_row.backend  = GGML_BACKEND_TYPE_GPU;
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								    src1_row.extra = &src1_row_extra;
 								    dst_row.extra = &dst_row_extra;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
+								        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    char * dst_original  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
+								        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								    if (src1->ne[1] == 1) {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
 								        GGML_ASSERT(dst->backend  == GGML_BACKEND_TYPE_GPU);
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
 								            //int32_t row_id;
 								            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
 								            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
 								            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
 								            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
 								        }
 								    } else {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
 								        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
 								        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
+								            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
-												Fix CudaMemcpy direction (#4599)


											
										
										
											2023-12-22 14:34:05 +01:00
+								            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
 								            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 								            int64_t num_src1_rows = 0;
 								            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
 								                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 								                if (row_id_i != row_id) {
 								                    continue;
 								                }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
+								                                        nb11, src1_kind, stream));
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								                num_src1_rows++;
 								            }
 								            if (num_src1_rows == 0) {
 								                continue;
 								            }
 								            src1_row.ne[1] = num_src1_rows;
 								            dst_row.ne[1] = num_src1_rows;
 								            src1_row.nb[1] = nb11;
 								            src1_row.nb[2] = num_src1_rows*nb11;
 								            src1_row.nb[3] = num_src1_rows*nb11;
 								            dst_row.nb[1] = nb1;
 								            dst_row.nb[2] = num_src1_rows*nb1;
 								            dst_row.nb[3] = num_src1_rows*nb1;
 								            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
 								            num_src1_rows = 0;
 								            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
 								                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 								                if (row_id_i != row_id) {
 								                    continue;
 								                }
 								                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
+								                                        nb1, dst_kind, stream));
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								                num_src1_rows++;
 								            }
 								        }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								    }
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
+								        CUDA_CHECK(cudaStreamSynchronize(stream));
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    const int64_t ne = ggml_nelements(src0);
 								    GGML_ASSERT(ne == ggml_nelements(src1));
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
 								    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
 								    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 								    const int64_t ne00 = src0->ne[0];
 								    const int64_t ne01 = src0->ne[1];
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int64_t ne02 = src0->ne[2];
-												sync : ggml (#0)

											
										
										
											2024-01-30 15:21:57 +01:00
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    //GGML_ASSERT(src0->ne[3] == 1);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int64_t nb00 = src0->nb[0];
 								    const int64_t nb01 = src0->nb[1];
 								    const int64_t nb02 = src0->nb[2];
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int64_t nb03 = src0->nb[3];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int64_t ne10 = src1->ne[0];
 								    const int64_t ne11 = src1->ne[1];
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int64_t ne12 = src1->ne[2];
 								    //GGML_ASSERT(src1->ne[3] == 1);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    const int64_t nb10 = src1->nb[0];
 								    const int64_t nb11 = src1->nb[1];
 								    const int64_t nb12 = src1->nb[2];
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								    const int64_t nb13 = src1->nb[3];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												cuda : fix vmm pool with multi GPU (#4620)

* cuda : fix vmm pool with multi GPU

* hip

* use recommended granularity instead of minimum

* better error checking

* fix mixtral

* use cudaMemcpy3DPeerAsync

* use cuda_pool_alloc in ggml_cuda_op_mul_mat

* consolidate error checking in ggml_cuda_set_device

* remove unnecessary inlines

ggml-ci

* style fixes

* only use vmm for the main device

* fix scratch buffer size, re-enable vmm pool for all devices

* remove unnecessary check id != g_main_device
											
										
										
											2023-12-26 21:23:59 +01:00
+								    ggml_cuda_set_device(g_main_device);
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 								    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
 								    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
 								    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 								    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
 								    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
 								        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    } else {
-												llama.cpp : split llama_context_params into model and context params (#3301)

* llama.cpp : split llama_context_params into model and context params

ggml-ci

* fix metal build

* fix freq_base/scale default to model value

* llama-bench : keep the same model between tests when possible

* move n_threads to llama_context_params, add n_threads_batch

* fix mpi build

* remove kv_size(), cuda scratch fixes

* remove low-vram option

* add n_threads_batch to system info, refactor to get_system_info()

* add documentation about --threads-batch to the READMEs

* llama-bench fix

* main : fix rope freq/scale warning

* llama.cpp : add llama_get_model
common : add llama_tokenize from model

* remove duplicated ctx/model functions

ggml-ci

* cuda : print total VRAM used
											
										
										
											2023-09-28 21:42:38 +02:00
+								        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
 								                ggml_type_name(src0->type), ggml_type_name(src1->type));
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        GGML_ASSERT(false);
 								    }
 								    (void) dst;
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												llama : per-layer KV cache + quantum K cache (#4309)

* per-layer KV

* remove unnecessary copies

* less code duplication, offload k and v separately

* llama : offload KV cache per-layer

* llama : offload K shift tensors

* llama : offload for rest of the model arches

* llama : enable offload debug temporarily

* llama : keep the KV related layers on the device

* llama : remove mirrors, perform Device -> Host when partial offload

* common : add command-line arg to disable KV cache offloading

* llama : update session save/load

* llama : support quantum K cache (#4312)

* llama : support quantum K cache (wip)

* metal : add F32 -> Q8_0 copy kernel

* cuda : add F32 -> Q8_0 copy kernel

ggml-ci

* cuda : use mmv kernel for quantum cache ops

* llama : pass KV cache type through API

* llama : fix build

ggml-ci

* metal : add F32 -> Q4_0 copy kernel

* metal : add F32 -> Q4_1 copy kernel

* cuda : wip

* cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels

* llama-bench : support type_k/type_v

* metal : use mm kernel only for quantum KV cache

* cuda : add comment

* llama : remove memory_f16 and kv_f16 flags

---------

Co-authored-by: slaren <slarengh@gmail.com>

* readme : add API change notice

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 12:03:17 +01:00
+								    // TODO: why do we pass dst as src1 here?
-												Support dup & cont ops on CUDA (#2242)


											
										
										
											2023-07-17 19:39:29 +02:00
+								    ggml_cuda_cpy(src0, dst, nullptr);
 								    (void) src1;
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												falcon : fix CUDA inference by making K and Q contiguous (#2830)

* falcon : fix CUDA inference by making K and Q contiguous

ggml-ci

* cuda : add assert to guard from non-cont ropes
											
										
										
											2023-08-27 15:40:48 +02:00
+								    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								}
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												CUDA: lower GPU latency + fix Windows performance (#3110)


											
										
										
											2023-09-11 19:55:51 +02:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								}
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
 								}
-												ggml-cuda : increase max graph size (#4084)


											
										
										
											2023-11-15 13:58:13 +01:00
+								static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    GGML_ASSERT(ggml_is_contiguous(src0));
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
 								}
 								static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 								    GGML_ASSERT(ggml_is_contiguous(src0));
 								    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
 								}
-												llama : custom attention mask + parallel decoding + no context swaps (#3228)

* tests : verify that RoPE is "additive"

* llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask)

* ggml : ggml_rope now takes a vector with positions instead of n_past

* metal : add rope_f16 kernel + optimize cpy kernels

* llama : unified KV cache + batch inference API

* llama : add new llama_decode() API that works with llama_batch

* llama : add cell_max heuristic for more efficient kv_cache

* llama : extend llama_kv_cache API

* llama : more robust cell_max heuristic + wip shift

* metal : disable concurrency optimization

* llama : add llama_kv_cache_shift_seq + no more context swaps

* llama : apply K-cache roping for Falcon and Baichuan

* speculative : fix KV cache management

* parallel : example for serving multiple users in parallel

* parallel : disable hot-plug to avoid cache fragmentation

* fixes : speculative KV cache + llama worst-case graph

* llama : extend batch API to select which logits to output

* llama : fix worst case graph build

* ggml-cuda : update rope implementation for parallel decoding (#3254)

* ggml-cuda : update rope implementation for parallel decoding

* better solution for p0 computation

* fix rope

* simpler rope implementation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make : add parallel to build + fix static functions in llama.cpp

* simple : fix token counting

* parallel : various improvements

* llama : fix cell_max logic + rename functions

* parallel : try smaller batches when the KV cache is fragmented

* parallel : fix sequence termination criteria

* llama : silence errors KV cache errors

* parallel : remove new line from prompt

* parallel : process system prompt once + configurable paramters + llama API

* parallel : remove question with short answers

* parallel : count cache misses

* parallel : print misses on each request

* parallel : minor

* llama : fix n_kv to never become 0

* parallel : rename hot-plug to continuous-batching

* llama : improve llama_batch API + simplify parallel example

* simple : add parallel decoding support

* simple : improve comments + free batch

* ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)

* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : disable MPI for now

ggml-ci

* train : make KQ_pos memory buffer permanent via dummy scale op

* ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275)

ggml-ci

* parallel : fix bug (extra BOS) + smaller token_prev array

* parallel : fix cases where the input prompts can overflow the batch

* parallel : add disabled experimental batch chunking in powers of two

* llama : llama.h formatting + comments

* simple : add README.md

* llama : fix kv cache heuristic when context is less than 32

* parallel : fix crash when `-n -1`

* llama : simplify returns if/else branches

* metal : use mm kernels for batch size > 2

* examples : utilize new llama_get_logits_ith()

* examples : add example for batched decoding

* examples : do not eval prompt 2 times (close #3348)

* server : clear the KV cache beyond n_past before llama_decode

* server : avoid context swaps by shifting the KV cache

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-09-28 18:04:36 +02:00
+								static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    (void) src0;
 								    (void) src1;
 								    (void) dst;
-												cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
											
										
										
											2023-05-01 18:11:07 +02:00
+								}
-												ggml : use ggml_row_size where possible (#4472)

* ggml : use ggml_row_size where possible

ggml-ci

* ggml : move ggml_nbytes_split to ggml-cuda.cu
											
										
										
											2023-12-14 20:05:21 +01:00
+								static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
 								    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 								    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
-												Fix the validation of main device (#1872)


											
										
										
											2023-06-15 19:29:59 +02:00
+								    if (main_device >= g_device_count) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
 								                main_device, g_device_count, g_main_device);
 								        return;
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    if (g_main_device != main_device && g_device_count > 1) {
 								        g_main_device = main_device;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								        //cudaDeviceProp prop;
 								        //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
 								        //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								    }
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    if (!g_cublas_loaded) return false;
-												cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946)

* protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build

* doc: add comments to ggml_cublas_loaded()

* fix defined(...)
											
										
										
											2023-11-07 07:49:08 +01:00
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    ggml_cuda_func_t func;
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
 								        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
 								        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
-												CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)


											
										
										
											2023-12-21 18:42:59 +01:00
+								    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        return false;
 								    }
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								    if (tensor->op == GGML_OP_MUL_MAT) {
 								        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 								#ifndef NDEBUG
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								#endif
 								            return false;
 								        }
 								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    switch (tensor->op) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        case GGML_OP_REPEAT:
 								            func = ggml_cuda_repeat;
 								            break;
 								        case GGML_OP_GET_ROWS:
 								            func = ggml_cuda_get_rows;
 								            break;
-												Support dup & cont ops on CUDA (#2242)


											
										
										
											2023-07-17 19:39:29 +02:00
+								        case GGML_OP_DUP:
 								            func = ggml_cuda_dup;
 								            break;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        case GGML_OP_ADD:
 								            func = ggml_cuda_add;
 								            break;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								        case GGML_OP_ACC:
 								            func = ggml_cuda_acc;
 								            break;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        case GGML_OP_MUL:
 								            func = ggml_cuda_mul;
 								            break;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_OP_DIV:
 								            func = ggml_cuda_div;
 								            break;
-												ggml : sync (unary ops refactor, static-correctness) (#2370)

* ggml : sync (unary ops, tests)

ggml-ci

* tests : remove unnecessary funcs
											
										
										
											2023-07-24 13:46:21 +02:00
+								        case GGML_OP_UNARY:
 								            switch (ggml_get_unary_op(tensor)) {
 								                case GGML_UNARY_OP_GELU:
 								                    func = ggml_cuda_gelu;
 								                    break;
 								                case GGML_UNARY_OP_SILU:
 								                    func = ggml_cuda_silu;
 								                    break;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								                case GGML_UNARY_OP_GELU_QUICK:
 								                    func = ggml_cuda_gelu_quick;
 								                    break;
 								                case GGML_UNARY_OP_TANH:
 								                    func = ggml_cuda_tanh;
 								                    break;
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								                case GGML_UNARY_OP_RELU:
 								                    func = ggml_cuda_relu;
 								                    break;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								                case GGML_UNARY_OP_HARDSIGMOID:
 								                    func = ggml_cuda_hardsigmoid;
 								                    break;
 								                case GGML_UNARY_OP_HARDSWISH:
 								                    func = ggml_cuda_hardswish;
 								                    break;
-												ggml : sync (unary ops refactor, static-correctness) (#2370)

* ggml : sync (unary ops, tests)

ggml-ci

* tests : remove unnecessary funcs
											
										
										
											2023-07-24 13:46:21 +02:00
+								                default:
 								                    return false;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            }
 								            break;
-												ggml : sync (abort callback, mul / add broadcast, fix alibi) (#2183)


											
										
										
											2023-07-11 21:53:34 +02:00
+								        case GGML_OP_NORM:
 								            func = ggml_cuda_norm;
 								            break;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								        case GGML_OP_GROUP_NORM:
 								            func = ggml_cuda_group_norm;
 								            break;
 								        case GGML_OP_CONCAT:
 								            func = ggml_cuda_concat;
 								            break;
 								        case GGML_OP_UPSCALE:
 								            func = ggml_cuda_upscale;
 								            break;
 								        case GGML_OP_PAD:
 								            func = ggml_cuda_pad;
 								            break;
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        case GGML_OP_ARANGE:
 								            func = ggml_cuda_arange;
 								            break;
 								        case GGML_OP_TIMESTEP_EMBEDDING:
 								            func = ggml_cuda_timestep_embedding;
 								            break;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								        case GGML_OP_LEAKY_RELU:
 								            func = ggml_cuda_leaky_relu;
 								            break;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        case GGML_OP_RMS_NORM:
 								            func = ggml_cuda_rms_norm;
 								            break;
 								        case GGML_OP_MUL_MAT:
-												ggml : remove src0 and src1 from ggml_tensor and rename opt to src (#2178)

* Add ggml changes

* Update train-text-from-scratch for change

* mpi : adapt to new ggml_tensor->src

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-07-11 18:31:10 +02:00
+								            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								                return false;
 								            }
 								            func = ggml_cuda_mul_mat;
 								            break;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_OP_MUL_MAT_ID:
 								            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
 								                return false;
 								            }
 								            func = ggml_cuda_mul_mat_id;
 								            break;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        case GGML_OP_SCALE:
 								            func = ggml_cuda_scale;
 								            break;
-												Add ReLU and SQR CUDA ops to (partially) fix Persimmon offloading (#4041)

* Add ReLU and SQR CUDA ops to fix Persimmon offloading

* Persimmon loader: More helpful error on CUDA/ROCM when offloading too many layers
											
										
										
											2023-11-13 09:58:15 +01:00
+								        case GGML_OP_SQR:
 								            func = ggml_cuda_sqr;
 								            break;
-												llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-10-10 09:50:23 +02:00
+								        case GGML_OP_CLAMP:
 								            func = ggml_cuda_clamp;
 								            break;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        case GGML_OP_CPY:
 								            func = ggml_cuda_cpy;
 								            break;
-												Support dup & cont ops on CUDA (#2242)


											
										
										
											2023-07-17 19:39:29 +02:00
+								        case GGML_OP_CONT:
 								            func = ggml_cuda_dup;
 								            break;
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								        case GGML_OP_NONE:
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        case GGML_OP_RESHAPE:
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        case GGML_OP_VIEW:
 								        case GGML_OP_PERMUTE:
 								        case GGML_OP_TRANSPOSE:
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								            func = ggml_cuda_nop;
 								            break;
-												CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE

* ggml_cuda_mul_mat_vec_p021

* ggml_cuda_scale

* ggml_cuda_diag_mask_inf

* ggml_is_permuted

* ggml_cuda_cpy

* flatten rows for ggml_cuda_op

* Added a --low-vram option

* Fixed Windows performance

* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
											
										
										
											2023-06-14 19:47:19 +02:00
+								        case GGML_OP_DIAG_MASK_INF:
 								            func = ggml_cuda_diag_mask_inf;
 								            break;
 								        case GGML_OP_SOFT_MAX:
 								            func = ggml_cuda_soft_max;
 								            break;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        case GGML_OP_ROPE:
 								            func = ggml_cuda_rope;
 								            break;
-												ggml : sync latest (SAM + SD operators, CUDA alibi) (#2709)

* ggml : sync latest (SAM + SD operators, CUDA alibi)

ggml-ci

* ggml : fix tabs
											
										
										
											2023-08-22 13:22:08 +02:00
+								        case GGML_OP_ALIBI:
 								            func = ggml_cuda_alibi;
 								            break;
-												ggml : sync (im2col, GPU conv, 32-bit arm compat) (#4060)

ggml-ci
											
										
										
											2023-11-13 15:55:52 +01:00
+								        case GGML_OP_IM2COL:
 								            func = ggml_cuda_im2col;
 								            break;
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								        case GGML_OP_POOL_2D:
 								            func = ggml_cuda_pool2d;
 								            break;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_OP_SUM_ROWS:
 								            func = ggml_cuda_sum_rows;
 								            break;
 								        case GGML_OP_ARGSORT:
 								            func = ggml_cuda_argsort;
 								            break;
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        default:
 								            return false;
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
+								    }
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-												CUDA: Faster Mixtral prompt processing (#4538)

* CUDA: make MoE tensors contiguous for batch size>1

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-20 15:41:22 +01:00
+								        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
 								    }
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    if (params->ith != 0) {
 								        return true;
 								    }
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								        return true;
 								    }
-												ggml : remove src0 and src1 from ggml_tensor and rename opt to src (#2178)

* Add ggml changes

* Update train-text-from-scratch for change

* mpi : adapt to new ggml_tensor->src

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-07-11 18:31:10 +02:00
+								    func(tensor->src[0], tensor->src[1], tensor);
-												Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
											
										
										
											2023-06-06 21:33:23 +02:00
+								    return true;
-												cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)

* Broadcasting for ggml_mul

* CUDA kernel for ggml_mul, norms in VRAM

* GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* fixup! GPU weights not in RAM, direct loading with cuFile

* define default model path once, sync path with readme (#1366)

* ~7% faster Q5_1 AVX2 code (#1477)

* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)

* Support models in a single pytorch_model.bin

* Remove spurious line with typo

* benchmark-matmul: Print the average of the test results (#1490)

* Remove unused n_parts parameter (#1509)

* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)

* Fix for w64devkit and mingw

* make kv_f16 the default for api users (#1517)

* minor : fix compile warnings

* readme : adds WizardLM to the list of supported models (#1485)

* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)

* Make reverse prompt option act as a stop token in non-interactive scenarios

* Making requested review changes

* Update gpt_params_parse and fix a merge error

* Revert "Update gpt_params_parse and fix a merge error"

This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.

* Update gpt_params_parse and fix a merge error take 2

* examples : add persistent chat (#1495)

* examples : add persistent chat

* examples : fix whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* tests : add missing header

* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)

* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0

* llama : bump LLAMA_FILE_VERSION to 3

* cuda : update Q4 and Q8 dequantize kernels

* ggml : fix AVX dot products

* readme : update performance table + hot topics

* ggml : fix scalar implementation of Q4_1 dot

* llama : fix compile warnings in llama_set_state_data()

* llama : fix name shadowing and C4146 (#1526)

* Fix name shadowing and C4146

* Fix if macros not using defined when required

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update llama-util.h

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Code style

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix for mingw (#1462)

* llama : add llama_init_backend() API (close #1527)

* feature : add blis and other BLAS implementation support (#1502)

* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Revert "feature : add blis and other BLAS implementation support (#1502)"

This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.

* GPU weights not in RAM, direct loading with cuFile

* llama : code style fixes + progress print fix

* ggml : ggml_mul better broadcast support

* cmake : workarounds for cufile when CMake version < 3.25

* gg rebase fixup

* Loop in llama.cpp, fixed progress callback

* Attempt clang-tidy fix

* llama : fix vram size computation

* Add forgotten fclose()

---------

Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
											
										
										
											2023-05-20 14:19:28 +02:00
+								}
-												llama : add benchmark example (#2626)

* llama : add benchmark example

* add to examples CMakeLists.txt

* fix msvc build

* add missing include

* add Bessel's correction to stdev calculation

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* improve markdown formatting

* add missing include

* print warning is NDEBUG is not defined

* remove n_prompt and n_gen from the matrix, use each value separately instead

* better checks for non-optimized builds

* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

* fix json formatting

* add sql output

* add basic cpu and gpu info (linx/cuda only)

* markdown: also show values that differ from the default

* markdown: add build id

* cleanup

* improve formatting

* formatting

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-18 12:44:58 +02:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL int ggml_cuda_get_device_count() {
-												llama : add benchmark example (#2626)

* llama : add benchmark example

* add to examples CMakeLists.txt

* fix msvc build

* add missing include

* add Bessel's correction to stdev calculation

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* improve markdown formatting

* add missing include

* print warning is NDEBUG is not defined

* remove n_prompt and n_gen from the matrix, use each value separately instead

* better checks for non-optimized builds

* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

* fix json formatting

* add sql output

* add basic cpu and gpu info (linx/cuda only)

* markdown: also show values that differ from the default

* markdown: add build id

* cleanup

* improve formatting

* formatting

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-18 12:44:58 +02:00
+								    int device_count;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
 								        return 0;
 								    }
-												llama : add benchmark example (#2626)

* llama : add benchmark example

* add to examples CMakeLists.txt

* fix msvc build

* add missing include

* add Bessel's correction to stdev calculation

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* improve markdown formatting

* add missing include

* print warning is NDEBUG is not defined

* remove n_prompt and n_gen from the matrix, use each value separately instead

* better checks for non-optimized builds

* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

* fix json formatting

* add sql output

* add basic cpu and gpu info (linx/cuda only)

* markdown: also show values that differ from the default

* markdown: add build id

* cleanup

* improve formatting

* formatting

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-18 12:44:58 +02:00
+								    return device_count;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
-												llama : add benchmark example (#2626)

* llama : add benchmark example

* add to examples CMakeLists.txt

* fix msvc build

* add missing include

* add Bessel's correction to stdev calculation

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* improve markdown formatting

* add missing include

* print warning is NDEBUG is not defined

* remove n_prompt and n_gen from the matrix, use each value separately instead

* better checks for non-optimized builds

* llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

* fix json formatting

* add sql output

* add basic cpu and gpu info (linx/cuda only)

* markdown: also show values that differ from the default

* markdown: add build id

* cleanup

* improve formatting

* formatting

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-08-18 12:44:58 +02:00
+								    cudaDeviceProp prop;
 								    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
 								    snprintf(description, description_size, "%s", prop.name);
 								}
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								////////////////////////////////////////////////////////////////////////////////
 								// backend interface
 								#define UNUSED GGML_UNUSED
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								struct ggml_backend_cuda_context {
 								    int device;
 								    std::string name;
 								};
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								// cuda buffer
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								struct ggml_backend_cuda_buffer_context {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    int device;
 								    void * dev_ptr = nullptr;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
 								    size_t temp_tensor_extra_index = 0;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    std::string name;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
 								        device(device), dev_ptr(dev_ptr),
 								        name(GGML_CUDA_NAME + std::to_string(device)) {
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ~ggml_backend_cuda_buffer_context() {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        delete[] temp_tensor_extras;
 								    }
 								    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								        // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        if (temp_tensor_extras == nullptr) {
-												ggml-cuda : increase max graph size (#4084)


											
										
										
											2023-11-15 13:58:13 +01:00
+								            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        }
 								        size_t alloc_index = temp_tensor_extra_index;
-												ggml-cuda : increase max graph size (#4084)


											
										
										
											2023-11-15 13:58:13 +01:00
+								        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
 								        memset(extra, 0, sizeof(*extra));
 								        return extra;
 								    }
 								};
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 								    return ctx->name.c_str();
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    CUDA_CHECK(cudaFree(ctx->dev_ptr));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    delete ctx;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    return ctx->dev_ptr;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								        assert(tensor->view_src->buffer->buft == buffer->buft);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        tensor->backend = tensor->view_src->backend;
 								        tensor->extra = tensor->view_src->extra;
 								        return;
 								    }
 								    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    extra->data_device[ctx->device] = tensor->data;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    tensor->backend = GGML_BACKEND_TYPE_GPU;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    tensor->extra = extra;
 								    if (ggml_is_quantized(tensor->type)) {
 								        // initialize padding to 0 to avoid possible NaN values
-												cuda : fix tensor size calculation for non-split buffer (#5145)


											
										
										
											2024-01-26 18:59:43 +01:00
+								        size_t original_size = ggml_nbytes(tensor);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								        if (padded_size > original_size && tensor->view_src == nullptr) {
-												cuda : fix tensor size calculation for non-split buffer (#5145)


											
										
										
											2024-01-26 18:59:43 +01:00
+								            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        }
 								    }
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								    ggml_cuda_set_device(ctx->device);
 								    CUDA_CHECK(cudaDeviceSynchronize());
 								    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
-												fix : cuda order of synchronization when setting a buffer (ggml/679)

* fix : cuda order of synchronization when setting a buffer

* also sync before memcpy

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-05 16:00:00 +01:00
+								    CUDA_CHECK(cudaDeviceSynchronize());
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
 								    ggml_cuda_set_device(ctx->device);
 								    CUDA_CHECK(cudaDeviceSynchronize());
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    CUDA_CHECK(cudaDeviceSynchronize());
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    if (ggml_backend_buffer_is_cuda(src->buffer)) {
 								        ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
 								        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 								        ggml_cuda_set_device(src_ctx->device);
 								        CUDA_CHECK(cudaDeviceSynchronize());
 								        ggml_cuda_set_device(dst_ctx->device);
 								        CUDA_CHECK(cudaDeviceSynchronize());
 								        CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
 								        CUDA_CHECK(cudaDeviceSynchronize());
 								        return true;
 								    }
 								    return false;
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
 								    ggml_cuda_set_device(ctx->device);
 								    CUDA_CHECK(cudaDeviceSynchronize());
 								    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    CUDA_CHECK(cudaDeviceSynchronize());
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
 								    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
 								    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
 								    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
 								    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
 								    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								    /* .clear           = */ ggml_backend_cuda_buffer_clear,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    /* .reset           = */ NULL,
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								};
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								// cuda buffer type
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								struct ggml_backend_cuda_buffer_type_context {
 								    int device;
 								    std::string name;
 								};
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 								    return ctx->name.c_str();
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_cuda_set_device(buft_ctx->device);
-												sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
											
										
										
											2023-11-13 13:16:23 +01:00
 								    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    void * dev_ptr;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    cudaError_t err = cudaMalloc(&dev_ptr, size);
 								    if (err != cudaSuccess) {
 								        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
 								        return nullptr;
 								    }
-												sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
											
										
										
											2023-11-13 13:16:23 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    return 128;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    UNUSED(buft);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-												cuda : fix tensor size calculation for non-split buffer (#5145)


											
										
										
											2024-01-26 18:59:43 +01:00
+								    size_t size = ggml_nbytes(tensor);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    int64_t ne0 = tensor->ne[0];
 								    if (ggml_is_quantized(tensor->type)) {
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
-												ggml : use ggml_row_size where possible (#4472)

* ggml : use ggml_row_size where possible

ggml-ci

* ggml : move ggml_nbytes_split to ggml-cuda.cu
											
										
										
											2023-12-14 20:05:21 +01:00
+								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        }
 								    }
 								    return size;
 								    UNUSED(buft);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    if (!ggml_backend_is_cuda(backend)) {
 								        return false;
 								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 								    return buft_ctx->device == cuda_ctx->device;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
 								    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-												ggml : add Vulkan backend (#2059)

* Vulkan loader code

* Fix matmul kernel, continue implementation

* Continue implementation

* Vulkan memory management

* Vulkan development

* Matmul call

* Add aligned malloc and free for VMA

* Continue implementation

* First matmul success

* GEMM Kernel optimization

* 1D Blocktiling

* 2D Blocktiling

* Write coalescing

* Continue vulkan implementation and optimization

* First FP16 attempt, disabled for now

* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel

* Enable device extensions properly, restore fp16 matmul op

* Fix mulmat_f16

* Output FP32 in fp16 matmul shader

* Fix f16_to_f32 kernel

* dequant_q4_0 kernel

* Add VMA library

* Avoid requesting dedicated memory, VMA can decide that by itself

* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly

* add cmake commands

* Add 2d write operation, profiling code

* Fix 2d write

* Fix queue selection for AMD RADV

* Fix trailing whitespace in vk_mem_alloc.h

* Add WIP warp tile mat mul shaders

* Disable glslc optimization

* Disable glslc optimization for CMake

* Optimize warptile matmul shader, replace blocktile with it

* Add split-k optimization for small matrix multiplication

Use semaphores for synchronization instead of fences or waitidle

Rework async write/read for synchronization

* Fix validation errors, improve compatibility with AMD GPUs

* Rework command buffer handling

* Variable matmul kernel using specialization constants

* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints

* Reuse semaphores

* Handle stage flags during command buffer submission properly

* Increase matmul test runs for consistent results

* Fix F32 matmul

* Add vectorized loading and zeropadding for matrix multiplication

* Use pinned memory for f16 preprocessing

* Don't force aligned matmul

* Don't free before queue done

* Replace VMA library with native Vulkan buffer management

* Basic offloading support with mul_f32 and dmmv for q4_0

* Run glslc commands in parallel

* Unroll loops in dmmv shader

* Reduce usage of waitIdle

* Reuse pinned allocation for f16 conversion

* Handle devices with only a single queue

* Fix trailing whitespace in CMakeLists.txt

* Allow parallel execution of kernels, parallelize third and fourth dimension calls

* Add fallback for devices only supporting one DescriptorSet per DescriptorPool

* Move to graph function similar to CUDA implementation

* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function

* Add F32 dmmv shaders

* Batch submissions

* Add .spv to gitignore

* Split off matrix vector multiplication for separate optimization

* Use single command buffer for matrix vector multiplication ops

* Reduce overhead of mul_f32 calls by using a single command buffer

* Add submission batching to mul_f32

* Fix tests

* Add missing barrier

* Add further missing barrier

* Add further ops

* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions

* Remove unnecessary cblas link

* Fix descriptor set pre-allocation assert

* Add runtime shader compilation, start transferring shaders to this approach

* Transfer remaining shaders to header and compile on runtime

* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16

* Add support for q4_1, q5_0, q5_1 and q8_0

* Remove unnecessary scalar layout extension

* Parse graph early to pre-record command buffers

* Add q6_k support

* Add multi-submit for command buffers

* Fix q6_k dequant shader for AMD

* Fix q6_k for GPUs without fp16 support

* Simplify q6_k fp16 fix

* Minor fixes

* Fix wg_denom of m-mulmat shaders

* Add Python-based Vulkan shader generator

* Replace shaderc dependency with precompiled shaders

Fix python script to generate shaders

* Clean up code

* Fix shader generator script Windows compatibility

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>

* Close file before deletion

* Fix vulkan shader fp32 name

* Add q2_k and q3_k support

Add validation check to compare shader results to cpu results

* Add q4_k support

* Add q5_k support

* Bake SPIR-V bytecode into the library instead of loading shaders from file

* Switch to signal semaphores for flexibility

Prepare broadcasting support for mul mat

* Finish broadcasting mul mat support for GQA

* Clean up unused functions

Add repeat op

* Add further ops, not yet enabled. Improve semaphore code

* Reduce number of used semaphores by utilizing timelines more properly

* Remove queue information

* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations

* Add Vulkan to llama-bench

* Remove cblas dependency

* Fix matmul k-split bug

* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader

* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug

* Fix issues with float16 overflows in shaders

* Fix issues with older Vulkan headers on Ubuntu 22.04

* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers

* Implement further ops, rework op_f32 calls, fix bugs

* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code

* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders

* Merge upstream changes, fix conflicts, adapt soft_max op

* Fix Python and shader header format

* Free model gpu buffers on exit

* Use single queue per device to simplify code

* Add matmul shader support for running multiple calculations in parallel

* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible

* Fix missing event cast

* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity

* Fix warning about empty C function parameters

* Fix compiler warnings

* Properly implement Vulkan backend buffer handling

* Fix oversized host staging buffers

* Simplify barrier synchronization calls

* Fix gcc warnings

* Implement max_size for backend buffer types to limit the size of a single allocation

* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size

* refactor multi buf

* Disable unsupported ops to fix tests

* Check for maintenance4 support before using it

* Handle devices with only a single queue

* Fix single queue logic

* propagate buffer usage in multi buffers

* Implement rope_neox op

* Cleanup header and other files

* Simplify gpu_extras by removing events and putting staging memcpys into contexts

* Move queue into context

Add not-yet-enabled async backend ops

* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization

* Add get_max_size to SYCL backend.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : fix trailing whitespace

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-28 18:03:59 +01:00
+								    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
 								    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    /* .is_host          = */ NULL,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								};
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    // FIXME: this is not thread safe
 								    if (device >= ggml_backend_cuda_get_device_count()) {
 								        return nullptr;
 								    }
 								    static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
 								    static bool ggml_backend_cuda_buffer_type_initialized = false;
 								    if (!ggml_backend_cuda_buffer_type_initialized) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								            ggml_backend_cuda_buffer_types[i] = {
 								                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								                /* .context  = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            };
 								        }
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								        ggml_backend_cuda_buffer_type_initialized = true;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    }
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								    return &ggml_backend_cuda_buffer_types[device];
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								// cuda split buffer
 								struct ggml_backend_cuda_split_buffer_context {
 								    ~ggml_backend_cuda_split_buffer_context() {
 								        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
 								            for (int id = 0; id < g_device_count; ++id) {
 								                for (int64_t is = 0; is < MAX_STREAMS; ++is) {
 								                    if (extra->events[id][is] != nullptr) {
 								                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
 								                    }
 								                }
 								                if (extra->data_device[id] != nullptr) {
 								                    CUDA_CHECK(cudaFree(extra->data_device[id]));
 								                }
 								            }
 								            delete extra;
 								        }
 								    }
 								    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 								};
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return GGML_CUDA_NAME "_Split";
 								    UNUSED(buffer);
 								}
-												cuda : ignore peer access already enabled errors (#5597)

* cuda : ignore peer access already enabled errors

* fix hip
											
										
										
											2024-02-19 23:40:26 +01:00
+								static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
 								    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
 								    UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 								    delete ctx;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
 								    return (void *)0x1000;
 								    UNUSED(buffer);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
 								    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 								    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
 								    const int64_t ne0 = tensor->ne[0];
 								    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
 								    ctx->tensor_extras.push_back(extra);
 								    for (int id = 0; id < g_device_count; ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
 								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
 								        size_t size = ggml_nbytes_split(tensor, nrows_split);
 								        const size_t original_size = size;
 								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								        // FIXME: do not crash if cudaMalloc fails
 								        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
 								        ggml_cuda_set_device(id);
 								        char * buf;
 								        CUDA_CHECK(cudaMalloc(&buf, size));
 								        // set padding to 0 to avoid possible NaN values
 								        if (size > original_size) {
 								            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
 								        }
 								        extra->data_device[id] = buf;
 								        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
 								            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
 								        }
 								    }
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    tensor->extra = extra;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    // split tensors must always be set in their entirety at once
 								    GGML_ASSERT(offset == 0);
 								    GGML_ASSERT(size == ggml_nbytes(tensor));
 								    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
 								    const int64_t ne0 = tensor->ne[0];
 								    const size_t nb1 = tensor->nb[1];
 								    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
 								    for (int id = 0; id < g_device_count; ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
 								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
 								        const size_t offset_split = row_low*nb1;
 								        size_t size = ggml_nbytes_split(tensor, nrows_split);
 								        const size_t original_size = size;
 								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								        const char * buf_host = (const char *)data + offset_split;
 								        CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
 								    }
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    // split tensors must always be set in their entirety at once
 								    GGML_ASSERT(offset == 0);
 								    GGML_ASSERT(size == ggml_nbytes(tensor));
 								    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
 								    const int64_t ne0 = tensor->ne[0];
 								    const size_t nb1 = tensor->nb[1];
 								    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
 								    for (int id = 0; id < g_device_count; ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
 								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
 								        const size_t offset_split = row_low*nb1;
 								        size_t size = ggml_nbytes_split(tensor, nrows_split);
 								        const size_t original_size = size;
 								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								        char * buf_host = (char *)data + offset_split;
 								        CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
 								    }
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    UNUSED(buffer);
 								    UNUSED(value);
 								}
 								static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
 								    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
 								    /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
 								    /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
 								    /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
 								    /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
 								    /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
 								    /* .cpy_tensor      = */ NULL,
 								    /* .clear           = */ ggml_backend_cuda_split_buffer_clear,
 								    /* .reset           = */ NULL,
 								};
 								// cuda split buffer type
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return GGML_CUDA_NAME "_Split";
 								    UNUSED(buft);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
 								    // instead, we allocate them for each tensor separately in init_tensor
 								    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
 								    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
 								    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
 								    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return 128;
 								    UNUSED(buft);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
 								    size_t total_size = 0;
 								    const int64_t ne0 = tensor->ne[0];
 								    for (int id = 0; id < g_device_count; ++id) {
 								        int64_t row_low, row_high;
 								        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
 								        int64_t nrows_split = row_high - row_low;
 								        if (nrows_split == 0) {
 								            continue;
 								        }
 								        total_size += ggml_nbytes_split(tensor, nrows_split);
 								        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 								        if (ne0 % MATRIX_ROW_PADDING != 0) {
 								            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
 								        }
 								    }
 								    return total_size;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return ggml_backend_is_cuda(backend);
 								    UNUSED(buft);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return false;
 								    UNUSED(buft);
 								}
 								static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
 								    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
 								    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
 								    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
-												ggml : add Vulkan backend (#2059)

* Vulkan loader code

* Fix matmul kernel, continue implementation

* Continue implementation

* Vulkan memory management

* Vulkan development

* Matmul call

* Add aligned malloc and free for VMA

* Continue implementation

* First matmul success

* GEMM Kernel optimization

* 1D Blocktiling

* 2D Blocktiling

* Write coalescing

* Continue vulkan implementation and optimization

* First FP16 attempt, disabled for now

* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel

* Enable device extensions properly, restore fp16 matmul op

* Fix mulmat_f16

* Output FP32 in fp16 matmul shader

* Fix f16_to_f32 kernel

* dequant_q4_0 kernel

* Add VMA library

* Avoid requesting dedicated memory, VMA can decide that by itself

* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly

* add cmake commands

* Add 2d write operation, profiling code

* Fix 2d write

* Fix queue selection for AMD RADV

* Fix trailing whitespace in vk_mem_alloc.h

* Add WIP warp tile mat mul shaders

* Disable glslc optimization

* Disable glslc optimization for CMake

* Optimize warptile matmul shader, replace blocktile with it

* Add split-k optimization for small matrix multiplication

Use semaphores for synchronization instead of fences or waitidle

Rework async write/read for synchronization

* Fix validation errors, improve compatibility with AMD GPUs

* Rework command buffer handling

* Variable matmul kernel using specialization constants

* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints

* Reuse semaphores

* Handle stage flags during command buffer submission properly

* Increase matmul test runs for consistent results

* Fix F32 matmul

* Add vectorized loading and zeropadding for matrix multiplication

* Use pinned memory for f16 preprocessing

* Don't force aligned matmul

* Don't free before queue done

* Replace VMA library with native Vulkan buffer management

* Basic offloading support with mul_f32 and dmmv for q4_0

* Run glslc commands in parallel

* Unroll loops in dmmv shader

* Reduce usage of waitIdle

* Reuse pinned allocation for f16 conversion

* Handle devices with only a single queue

* Fix trailing whitespace in CMakeLists.txt

* Allow parallel execution of kernels, parallelize third and fourth dimension calls

* Add fallback for devices only supporting one DescriptorSet per DescriptorPool

* Move to graph function similar to CUDA implementation

* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function

* Add F32 dmmv shaders

* Batch submissions

* Add .spv to gitignore

* Split off matrix vector multiplication for separate optimization

* Use single command buffer for matrix vector multiplication ops

* Reduce overhead of mul_f32 calls by using a single command buffer

* Add submission batching to mul_f32

* Fix tests

* Add missing barrier

* Add further missing barrier

* Add further ops

* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions

* Remove unnecessary cblas link

* Fix descriptor set pre-allocation assert

* Add runtime shader compilation, start transferring shaders to this approach

* Transfer remaining shaders to header and compile on runtime

* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16

* Add support for q4_1, q5_0, q5_1 and q8_0

* Remove unnecessary scalar layout extension

* Parse graph early to pre-record command buffers

* Add q6_k support

* Add multi-submit for command buffers

* Fix q6_k dequant shader for AMD

* Fix q6_k for GPUs without fp16 support

* Simplify q6_k fp16 fix

* Minor fixes

* Fix wg_denom of m-mulmat shaders

* Add Python-based Vulkan shader generator

* Replace shaderc dependency with precompiled shaders

Fix python script to generate shaders

* Clean up code

* Fix shader generator script Windows compatibility

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>

* Close file before deletion

* Fix vulkan shader fp32 name

* Add q2_k and q3_k support

Add validation check to compare shader results to cpu results

* Add q4_k support

* Add q5_k support

* Bake SPIR-V bytecode into the library instead of loading shaders from file

* Switch to signal semaphores for flexibility

Prepare broadcasting support for mul mat

* Finish broadcasting mul mat support for GQA

* Clean up unused functions

Add repeat op

* Add further ops, not yet enabled. Improve semaphore code

* Reduce number of used semaphores by utilizing timelines more properly

* Remove queue information

* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations

* Add Vulkan to llama-bench

* Remove cblas dependency

* Fix matmul k-split bug

* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader

* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug

* Fix issues with float16 overflows in shaders

* Fix issues with older Vulkan headers on Ubuntu 22.04

* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers

* Implement further ops, rework op_f32 calls, fix bugs

* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code

* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders

* Merge upstream changes, fix conflicts, adapt soft_max op

* Fix Python and shader header format

* Free model gpu buffers on exit

* Use single queue per device to simplify code

* Add matmul shader support for running multiple calculations in parallel

* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible

* Fix missing event cast

* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity

* Fix warning about empty C function parameters

* Fix compiler warnings

* Properly implement Vulkan backend buffer handling

* Fix oversized host staging buffers

* Simplify barrier synchronization calls

* Fix gcc warnings

* Implement max_size for backend buffer types to limit the size of a single allocation

* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size

* refactor multi buf

* Disable unsupported ops to fix tests

* Check for maintenance4 support before using it

* Handle devices with only a single queue

* Fix single queue logic

* propagate buffer usage in multi buffers

* Implement rope_neox op

* Cleanup header and other files

* Simplify gpu_extras by removing events and putting staging memcpys into contexts

* Move queue into context

Add not-yet-enabled async backend ops

* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization

* Add get_max_size to SYCL backend.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : fix trailing whitespace

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-28 18:03:59 +01:00
+								    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
 								    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
 								    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 								};
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    // FIXME: this is not thread safe
 								    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 								    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
 								    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
 								    if (all_zero) {
 								        tensor_split_arr = g_default_tensor_split;
 								    } else {
 								        float split_sum = 0.0f;
 								        for (int i = 0; i < g_device_count; ++i) {
 								            tensor_split_arr[i] = split_sum;
 								            split_sum += tensor_split[i];
 								        }
 								        for (int i = 0; i < g_device_count; ++i) {
 								            tensor_split_arr[i] /= split_sum;
 								        }
 								    }
 								    auto it = buft_map.find(tensor_split_arr);
 								    if (it != buft_map.end()) {
 								        return &it->second;
 								    }
 								    struct ggml_backend_buffer_type buft {
 								        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
 								        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
 								    };
 								    auto result = buft_map.emplace(tensor_split_arr, buft);
 								    return &result.first->second;
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								// host buffer type
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return GGML_CUDA_NAME "_Host";
 								    UNUSED(buft);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return GGML_CUDA_NAME "_Host";
 								    UNUSED(buffer);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-												fallback to CPU buffer if host buffer alloc fails (#4610)


											
										
										
											2023-12-23 16:10:51 +01:00
+								    ggml_cuda_host_free(buffer->context);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-												fallback to CPU buffer if host buffer alloc fails (#4610)


											
										
										
											2023-12-23 16:10:51 +01:00
+								    void * ptr = ggml_cuda_host_malloc(size);
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
-												fallback to CPU buffer if host buffer alloc fails (#4610)


											
										
										
											2023-12-23 16:10:51 +01:00
+								    if (ptr == nullptr) {
-												cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2023-12-24 14:34:22 +01:00
+								        // fallback to cpu buffer
 								        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-												fallback to CPU buffer if host buffer alloc fails (#4610)


											
										
										
											2023-12-23 16:10:51 +01:00
+								    }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
 								    buffer->buft = buft;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 								    return buffer;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
 								        /* .iface    = */ {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
 								            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-												ggml : add Vulkan backend (#2059)

* Vulkan loader code

* Fix matmul kernel, continue implementation

* Continue implementation

* Vulkan memory management

* Vulkan development

* Matmul call

* Add aligned malloc and free for VMA

* Continue implementation

* First matmul success

* GEMM Kernel optimization

* 1D Blocktiling

* 2D Blocktiling

* Write coalescing

* Continue vulkan implementation and optimization

* First FP16 attempt, disabled for now

* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel

* Enable device extensions properly, restore fp16 matmul op

* Fix mulmat_f16

* Output FP32 in fp16 matmul shader

* Fix f16_to_f32 kernel

* dequant_q4_0 kernel

* Add VMA library

* Avoid requesting dedicated memory, VMA can decide that by itself

* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly

* add cmake commands

* Add 2d write operation, profiling code

* Fix 2d write

* Fix queue selection for AMD RADV

* Fix trailing whitespace in vk_mem_alloc.h

* Add WIP warp tile mat mul shaders

* Disable glslc optimization

* Disable glslc optimization for CMake

* Optimize warptile matmul shader, replace blocktile with it

* Add split-k optimization for small matrix multiplication

Use semaphores for synchronization instead of fences or waitidle

Rework async write/read for synchronization

* Fix validation errors, improve compatibility with AMD GPUs

* Rework command buffer handling

* Variable matmul kernel using specialization constants

* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints

* Reuse semaphores

* Handle stage flags during command buffer submission properly

* Increase matmul test runs for consistent results

* Fix F32 matmul

* Add vectorized loading and zeropadding for matrix multiplication

* Use pinned memory for f16 preprocessing

* Don't force aligned matmul

* Don't free before queue done

* Replace VMA library with native Vulkan buffer management

* Basic offloading support with mul_f32 and dmmv for q4_0

* Run glslc commands in parallel

* Unroll loops in dmmv shader

* Reduce usage of waitIdle

* Reuse pinned allocation for f16 conversion

* Handle devices with only a single queue

* Fix trailing whitespace in CMakeLists.txt

* Allow parallel execution of kernels, parallelize third and fourth dimension calls

* Add fallback for devices only supporting one DescriptorSet per DescriptorPool

* Move to graph function similar to CUDA implementation

* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function

* Add F32 dmmv shaders

* Batch submissions

* Add .spv to gitignore

* Split off matrix vector multiplication for separate optimization

* Use single command buffer for matrix vector multiplication ops

* Reduce overhead of mul_f32 calls by using a single command buffer

* Add submission batching to mul_f32

* Fix tests

* Add missing barrier

* Add further missing barrier

* Add further ops

* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions

* Remove unnecessary cblas link

* Fix descriptor set pre-allocation assert

* Add runtime shader compilation, start transferring shaders to this approach

* Transfer remaining shaders to header and compile on runtime

* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16

* Add support for q4_1, q5_0, q5_1 and q8_0

* Remove unnecessary scalar layout extension

* Parse graph early to pre-record command buffers

* Add q6_k support

* Add multi-submit for command buffers

* Fix q6_k dequant shader for AMD

* Fix q6_k for GPUs without fp16 support

* Simplify q6_k fp16 fix

* Minor fixes

* Fix wg_denom of m-mulmat shaders

* Add Python-based Vulkan shader generator

* Replace shaderc dependency with precompiled shaders

Fix python script to generate shaders

* Clean up code

* Fix shader generator script Windows compatibility

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>

* Close file before deletion

* Fix vulkan shader fp32 name

* Add q2_k and q3_k support

Add validation check to compare shader results to cpu results

* Add q4_k support

* Add q5_k support

* Bake SPIR-V bytecode into the library instead of loading shaders from file

* Switch to signal semaphores for flexibility

Prepare broadcasting support for mul mat

* Finish broadcasting mul mat support for GQA

* Clean up unused functions

Add repeat op

* Add further ops, not yet enabled. Improve semaphore code

* Reduce number of used semaphores by utilizing timelines more properly

* Remove queue information

* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations

* Add Vulkan to llama-bench

* Remove cblas dependency

* Fix matmul k-split bug

* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader

* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug

* Fix issues with float16 overflows in shaders

* Fix issues with older Vulkan headers on Ubuntu 22.04

* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers

* Implement further ops, rework op_f32 calls, fix bugs

* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code

* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders

* Merge upstream changes, fix conflicts, adapt soft_max op

* Fix Python and shader header format

* Free model gpu buffers on exit

* Use single queue per device to simplify code

* Add matmul shader support for running multiple calculations in parallel

* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible

* Fix missing event cast

* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity

* Fix warning about empty C function parameters

* Fix compiler warnings

* Properly implement Vulkan backend buffer handling

* Fix oversized host staging buffers

* Simplify barrier synchronization calls

* Fix gcc warnings

* Implement max_size for backend buffer types to limit the size of a single allocation

* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size

* refactor multi buf

* Disable unsupported ops to fix tests

* Check for maintenance4 support before using it

* Handle devices with only a single queue

* Fix single queue logic

* propagate buffer usage in multi buffers

* Implement rope_neox op

* Cleanup header and other files

* Simplify gpu_extras by removing events and putting staging memcpys into contexts

* Move queue into context

Add not-yet-enabled async backend ops

* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization

* Add get_max_size to SYCL backend.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* llama : fix trailing whitespace

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-28 18:03:59 +01:00
+								            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
 								            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
 								            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
 								        },
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        /* .context  = */ nullptr,
 								    };
-												llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-12-21 21:07:46 +01:00
+								    return &ggml_backend_cuda_buffer_type_host;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
 								// backend
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return cuda_ctx->name.c_str();
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    delete cuda_ctx;
 								    delete backend;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
 								        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
 								        return true;
 								    }
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return false;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								    UNUSED(backend);
 								}
-												ggml : introduce ggml_status (ggml/750)

* using enum as an exit code instead of macros

* update return type from enum to unsigned int

* indentation fix

* compound update
ggml_compute_exit_code -> ggml_status
changed ggml_status from a bit-field type to simple codes
ggml_status to string cast

* ggml_status to string cast

* GGML_CALL was removed

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

											
										
										
											2024-03-04 10:05:42 +01:00
+								GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
 								    ggml_cuda_set_main_device(cuda_ctx->device);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								    ggml_compute_params params = {};
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								    params.type = GGML_TASK_TYPE_COMPUTE;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    params.ith = 0;
 								    for (int i = 0; i < cgraph->n_nodes; i++) {
 								        ggml_tensor * node = cgraph->nodes[i];
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-												sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
											
										
										
											2023-11-13 13:16:23 +01:00
+								            continue;
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								        }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								#ifndef NDEBUG
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								        assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
 								        assert(node->extra != nullptr);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        for (int j = 0; j < GGML_MAX_SRC; j++) {
 								            if (node->src[j] != nullptr) {
-												code : normalize enum names (#5697)

* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
											
										
										
											2024-02-25 11:09:09 +01:00
+								                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
-												cuda : ignore peer access already enabled errors (#5597)

* cuda : ignore peer access already enabled errors

* fix hip
											
										
										
											2024-02-19 23:40:26 +01:00
+								                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								                assert(node->src[j]->extra != nullptr);
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								            }
 								        }
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								#endif
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								        bool ok = ggml_cuda_compute_forward(&params, node);
 								        if (!ok) {
 								            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
 								        }
 								        GGML_ASSERT(ok);
 								    }
-												ggml : introduce ggml_status (ggml/750)

* using enum as an exit code instead of macros

* update return type from enum to unsigned int

* indentation fix

* compound update
ggml_compute_exit_code -> ggml_status
changed ggml_status from a bit-field type to simple codes
ggml_status to string cast

* ggml_status to string cast

* GGML_CALL was removed

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

											
										
										
											2024-03-04 10:05:42 +01:00
+								    return GGML_STATUS_SUCCESS;
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    switch (op->op) {
 								        case GGML_OP_UNARY:
 								            switch (ggml_get_unary_op(op)) {
 								                case GGML_UNARY_OP_GELU:
 								                case GGML_UNARY_OP_SILU:
 								                case GGML_UNARY_OP_RELU:
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								                case GGML_UNARY_OP_HARDSIGMOID:
 								                case GGML_UNARY_OP_HARDSWISH:
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								                case GGML_UNARY_OP_GELU_QUICK:
 								                case GGML_UNARY_OP_TANH:
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								                    return true;
 								                default:
 								                    return false;
 								            }
 								            break;
 								        case GGML_OP_MUL_MAT:
 								        case GGML_OP_MUL_MAT_ID:
 								            {
 								                struct ggml_tensor * a;
 								                struct ggml_tensor * b;
 								                if (op->op == GGML_OP_MUL_MAT) {
 								                    a = op->src[0];
 								                    b = op->src[1];
 								                } else {
 								                    a = op->src[2];
 								                    b = op->src[1];
 								                }
 								                if (a->ne[3] != b->ne[3]) {
 								                    return false;
 								                }
-												ggml : add IQ2 to test-backend-ops + refactoring (#4990)

* ggml : add IQ2 to test-backend-ops + refactoring

ggml-ci

* cuda : update supports_op for IQ2

ggml-ci

* ci : enable LLAMA_CUBLAS=1 for CUDA nodes

ggml-ci

* cuda : fix out-of-bounds-access in `mul_mat_vec_q`

ggml-ci

* tests : avoid creating RNGs for each Q tensor

ggml-ci

* tests : avoid creating RNGs for each tensor

ggml-ci
											
										
										
											2024-01-17 17:54:56 +01:00
+								                ggml_type a_type = a->type;
-												IQ4_NL: 4-bit non-linear quants with blocks of 32 (#5590)

* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* iq4_nl: Fix after merging with master

* iq4_nl: another fix after merging with master

* Use IQ4_NL instead of Q4_K when using k-quants is not possible

* Fix typo that makes several tests fail

* It was the ggml_vdotq thing missed inside the brackets

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-21 10:39:52 +01:00
+								                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-												Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (#5721)

* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-02-26 17:28:38 +01:00
+								                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
-												IQ4_XS: a 4.25 bpw quantization (#5747)

* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-27 15:34:24 +01:00
+								                    a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
-												ggml : add IQ2 to test-backend-ops + refactoring (#4990)

* ggml : add IQ2 to test-backend-ops + refactoring

ggml-ci

* cuda : update supports_op for IQ2

ggml-ci

* ci : enable LLAMA_CUBLAS=1 for CUDA nodes

ggml-ci

* cuda : fix out-of-bounds-access in `mul_mat_vec_q`

ggml-ci

* tests : avoid creating RNGs for each Q tensor

ggml-ci

* tests : avoid creating RNGs for each tensor

ggml-ci
											
										
										
											2024-01-17 17:54:56 +01:00
+								                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
 								                        return false;
 								                    }
 								                }
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								                return true;
 								            } break;
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								        case GGML_OP_GET_ROWS:
 								            {
 								                switch (op->src[0]->type) {
 								                    case GGML_TYPE_F16:
 								                    case GGML_TYPE_F32:
 								                    case GGML_TYPE_Q4_0:
 								                    case GGML_TYPE_Q4_1:
 								                    case GGML_TYPE_Q5_0:
 								                    case GGML_TYPE_Q5_1:
 								                    case GGML_TYPE_Q8_0:
 								                        return true;
 								                    default:
 								                        return false;
 								                }
 								            } break;
 								        case GGML_OP_CPY:
 								            {
 								                ggml_type src0_type = op->src[0]->type;
 								                ggml_type src1_type = op->src[1]->type;
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
 								                    return true;
 								                }
 								                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
 								                    return true;
 								                }
-												`ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)

* added cuda float16->float32 upcasting to ggml_cuda_cpy

* added ability to copy 4d tensors with the cuda backend

* added tests for float16_>float32 upcast and 4d tensor cuda copys

* added 4d copy test for float32->float16 copy

* applied patch suggested by @iamlemec

* simplify cpy tests

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-29 13:37:33 +01:00
+								                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
 								                    return true;
 								                }
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
+								                return false;
 								            } break;
-												cuda : mark I16 and I32 ops as unsupported

ggml-ci

											
										
										
											2024-01-03 12:01:44 +01:00
+								        case GGML_OP_DUP:
 								        case GGML_OP_REPEAT:
 								        case GGML_OP_CONCAT:
 								            {
 								                ggml_type src0_type = op->src[0]->type;
-												cuda : simplify expression

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-01-03 13:18:46 +01:00
+								                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-												cuda : mark I16 and I32 ops as unsupported

ggml-ci

											
										
										
											2024-01-03 12:01:44 +01:00
+								            } break;
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_OP_NONE:
 								        case GGML_OP_RESHAPE:
 								        case GGML_OP_VIEW:
 								        case GGML_OP_PERMUTE:
 								        case GGML_OP_TRANSPOSE:
 								        case GGML_OP_NORM:
 								        case GGML_OP_ADD:
 								        case GGML_OP_MUL:
 								        case GGML_OP_DIV:
 								        case GGML_OP_RMS_NORM:
 								        case GGML_OP_SCALE:
 								        case GGML_OP_SQR:
 								        case GGML_OP_CLAMP:
 								        case GGML_OP_CONT:
 								        case GGML_OP_DIAG_MASK_INF:
 								        case GGML_OP_SOFT_MAX:
 								        case GGML_OP_ROPE:
 								        case GGML_OP_ALIBI:
 								        case GGML_OP_IM2COL:
-												llava : add MobileVLM support (#5132)

* New Feature:
    1. Sum_Rows:
        fix cuda kernel overflow
        fix block shape error when nrows too big
    2. Im2Col:
        Support Batch in cuda
        Support f32 to f32 both in cpu && cuda
    3. DepthWiseConv:
        Support by Im2Col && MulMat
    4. Pool_2d:
        Supoort avg pooling in cuda
    5. HardSigmoid:
        Imp in cuda
    6. HardSwish:
        Imp in cuda

* fix tabs instead of spaces

* code clean

* CUDA POOL2D

* ADD POOL2D test case in test-backend-ops.cpp

* code clean

* fix pool2d_kernel

nits

* fix bug in pool2d kernel

* fix avg pooling, count_include_pad

nits

* test-backend-ops : add more pool_2d tests

* cuda : fix warnings and formatting

* ggml : check types in release builds too in pool_2d

* test-backend-ops : remove f16 pool_2d tests

* cuda : more style fixes

* Add assert in ggml_cuda_op_pool2d

* pool2d float padding fallback

* test-backend-ops : add dst_type to im2col

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-01-31 14:10:15 +01:00
+								        case GGML_OP_POOL_2D:
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								        case GGML_OP_SUM_ROWS:
 								        case GGML_OP_ARGSORT:
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								        case GGML_OP_ACC:
 								        case GGML_OP_GROUP_NORM:
 								        case GGML_OP_UPSCALE:
 								        case GGML_OP_PAD:
-												add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-03-03 13:23:52 +01:00
+								        case GGML_OP_ARANGE:
 								        case GGML_OP_TIMESTEP_EMBEDDING:
-												sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-13 20:54:54 +01:00
+								        case GGML_OP_LEAKY_RELU:
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								            return true;
 								        default:
 								            return false;
 								    }
 								    UNUSED(backend);
 								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								static ggml_backend_i ggml_backend_cuda_interface = {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    /* .get_name                = */ ggml_backend_cuda_name,
 								    /* .free                    = */ ggml_backend_cuda_free,
 								    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
 								    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
 								    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    /* .synchronize             = */ ggml_backend_cuda_synchronize,
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    /* .graph_plan_create       = */ NULL,
 								    /* .graph_plan_free         = */ NULL,
 								    /* .graph_plan_compute      = */ NULL,
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
 								    /* .supports_op             = */ ggml_backend_cuda_supports_op,
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								};
-												Introduce backend GUIDs (ggml/743)

* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-02-24 17:27:36 +01:00
+								static ggml_guid_t ggml_backend_cuda_guid() {
 								    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
 								    return &guid;
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								    ggml_init_cublas(); // TODO: remove from ggml.c
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    if (device < 0 || device >= ggml_cuda_get_device_count()) {
 								        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
 								        return nullptr;
 								    }
 								    // not strictly necessary, but it may reduce the overhead of the first graph_compute
 								    ggml_cuda_set_main_device(device);
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
 								        /* .device = */ device,
 								        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    };
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
 								    ggml_backend_t cuda_backend = new ggml_backend {
-												Introduce backend GUIDs (ggml/743)

* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-02-24 17:27:36 +01:00
+								        /* .guid      = */ ggml_backend_cuda_guid(),
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								        /* .interface = */ ggml_backend_cuda_interface,
-												sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
											
										
										
											2023-10-08 19:19:14 +02:00
+								        /* .context   = */ ctx
 								    };
 								    return cuda_backend;
 								}
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
-												Introduce backend GUIDs (ggml/743)

* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>

											
										
										
											2024-02-24 17:27:36 +01:00
+								    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL int ggml_backend_cuda_get_device_count() {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    return ggml_cuda_get_device_count();
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_cuda_get_device_description(device, description, description_size);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								    ggml_cuda_set_device(device);
 								    CUDA_CHECK(cudaMemGetInfo(free, total));
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								}
-												llama : ggml-backend integration (#4766)

* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (#4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (#4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-01-12 20:07:38 +01:00
+								// backend registry
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
 								    return cuda_backend;
 								    UNUSED(params);
 								}
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
-												llama : add Mixtral support (#4406)

* convert : support Mixtral as LLAMA arch

* convert : fix n_ff typo

* llama : model loading

* ggml : sync latest ggml_mul_mat_id

* llama : update graph to support MoE

* llama : fix cur -> cur_expert

* llama : first working version

* llama : fix expert weighting in the FFN

* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)

* ggml : add n_as argument to ggml_mul_mat_id

* ggml : fix ggml_get_rows to take into account ne02 / ne11

* metal : add more general support for ggml_get_rows + tests

* llama : add basic support for offloading moe with CUDA

* metal : add/mul/div use general kernel when src1 not cont

* metal : reduce the kernel launches for ggml_mul_mat_id

* ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D

* ggml : update get_rows f16 and q

* cuda : support non-contiguous src1 in get_rows

* llama : offload missing ffn_moe_silu

* metal : fix ggml_get_rows to work with non-cont src1

* metal : add indirect mat-vec kernels for all quantization types

* llama : do not quantize expert gating tensors

* llama : add n_expert and n_expert_used to hparams + change quants

* test-backend-ops : add moe test

* cuda : fix get_rows when ncols is odd

* convert : determine n_ctx correctly

* metal : fix ggml_mul_mat_id for F32

* test-backend-ops : make experts more evenly probable (test_moe)

* test-backend-ops : cleanup, add moe test for batches

* test-backend-ops : add cpy from f32 -> all types test

* test-backend-ops : fix dequantize block offset

* llama : fix hard-coded number of experts

* test-backend-ops : simplify and disable slow tests to avoid CI timeout

* test-backend-ops : disable MOE test with thread sanitizer

* cuda : fix mul_mat_id with multi gpu

* convert : use 1e6 rope_freq_base for mixtral

* convert : fix style

* convert : support safetensors format

* gguf-py : bump version

* metal : add cpy f16 -> f32 kernel

* metal : fix binary ops for ne10 % 4 != 0

* test-backend-ops : add one more sum_rows test

* ggml : do not use BLAS with ggml_mul_mat_id

* convert-hf : support for mixtral-instruct (#4428)

* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct

* convert : use sentencepiece tokenizer for Mixtral-instruct

* convert : make flake8 happy

* metal : fix soft_max kernels

ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92

* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
											
										
										
											2023-12-13 13:04:25 +01:00
-												ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing, unless the build defines -DGGML_MULTIPLATFORM
which causes back-references and function pointers to conform to MS ABI
which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms
											
										
										
											2024-01-16 12:16:33 +01:00
+								GGML_CALL int ggml_backend_cuda_reg_devices() {
-												sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2023-12-07 21:26:54 +01:00
+								    int device_count = ggml_cuda_get_device_count();
 								    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
 								    for (int i = 0; i < device_count; i++) {
 								        char name[128];
 								        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
 								        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
 								    }
 								    return device_count;
 								}