llama.cpp/ggml-cuda/common.cuh

#pragma once

#include "ggml.h"
#include "ggml-cuda.h"

#include <memory>

#if defined(GGML_USE_HIPBLAS)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#endif
#include "ggml-common.h"

#include <cstdio>
#include <array>
#include <cassert>
#include <cfloat>
#include <string>

#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F  HIPBLAS_R_16F
#define CUDA_R_32F  HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
#else
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#endif
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap abort
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#else
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020

#endif // defined(GGML_USE_HIPBLAS)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

#define CC_PASCAL     600
#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA      700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
#define CC_RDNA3      (CC_OFFSET_AMD + 1100)

// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// -  7B quantized model: +100-200 MB
// - 13B quantized model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ

// TODO: improve this to be correct for more hardware
//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif

#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
#define  MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GGML_CUDA_MAX_STREAMS 8

// Error reporter used by the *_CHECK macros below. Declared [[noreturn]]: callers may rely on it not
// returning. Receives the failing statement text, the enclosing function, file and line, and a
// human-readable message for the error code.
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

// Generic status check.
//   err      - expression to evaluate; it is evaluated exactly once and its value stored in err_
//   success  - value that denotes success for this API family
//   error_fn - callable that turns the failing status value into a message string
// On mismatch, ggml_cuda_error() is invoked with the stringized expression (#err) and the call site.
// Wrapped in do { } while (0) so the macro behaves like a single statement.
#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
     do {                                                                           \
        auto err_ = (err);                                                          \
        if (err_ != (success)) {                                                    \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
        }                                                                           \
    } while (0)

// Runtime-API flavour: success is cudaSuccess, messages come from cudaGetErrorString.
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)

// Translates a cuBLAS status code into a printable string for CUBLAS_CHECK.
// With CUDART_VERSION >= 12000 the library's own cublasGetStatusString() is used; otherwise
// (including when CUDART_VERSION is not defined at all) a local table of the known codes is
// consulted and any other value is reported as "unknown error".
static const char * cublas_get_error_str(const cublasStatus_t err) {
#if CUDART_VERSION >= 12000
    return cublasGetStatusString(err);
#else
// one table row: the returned text is the spelling of the status identifier itself
#define GGML_CUBLAS_STATUS_NAME(code) case code: return #code
    switch (err) {
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_SUCCESS);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_NOT_INITIALIZED);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_ALLOC_FAILED);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_INVALID_VALUE);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_ARCH_MISMATCH);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_MAPPING_ERROR);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_EXECUTION_FAILED);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_INTERNAL_ERROR);
        GGML_CUBLAS_STATUS_NAME(CUBLAS_STATUS_NOT_SUPPORTED);
        default: return "unknown error";
    }
#undef GGML_CUBLAS_STATUS_NAME
#endif // CUDART_VERSION >= 12000
}

// cuBLAS flavour of the status check: success is CUBLAS_STATUS_SUCCESS.
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIPBLAS)
// Returns the driver-API description of `err` for use by CU_CHECK.
// cuGetErrorString() can fail for codes it does not recognize and then stores a null pointer, so the
// call's result and the returned pointer are both checked; a fixed fallback string is returned instead
// of passing a null or indeterminate pointer on to the error reporter.
static const char * cu_get_error_str(CUresult err) {
    const char * err_str = nullptr;
    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS || err_str == nullptr) {
        return "unknown error";
    }
    return err_str;
}
// Driver-API flavour of the status check: success is CUDA_SUCCESS.
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

// Optimizer hint: forwards the condition to __builtin_assume() when CUDART_VERSION >= 11100.
// In every other configuration the macro expands to nothing, so its argument is not evaluated.
#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100

// Scalar and 2-component element types used for dequantized values:
// half / half2 when GGML_CUDA_F16 is defined, float / float2 otherwise.
#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16

// Device-side fatal error for kernels reached on an architecture they were not built for.
// Prints the call site, the kernel name and the offending arch (plus the compiled arch list on the
// non-HIP path), then calls __trap(). Declared [[noreturn]].
[[noreturn]]
static __device__ void no_device_code(
        const char * file_name,
        const int    line,
        const char * function_name,
        const int    arch,
        const char * arch_list) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#else
    // the HIP message does not print the arch list
    GGML_UNUSED(arch_list);
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
}

// Expands to a call of no_device_code() carrying the current file, line, function, the value of
// __CUDA_ARCH__ and the stringized __CUDA_ARCH_LIST__. When __CUDA_ARCH__ is not defined at this point
// of the header, the macro expands to nothing (the text after `//` in the #else branch is a comment).
// NOTE(review): this #ifdef is evaluated here, ahead of the HIP section further down that defines
// __CUDA_ARCH__ - confirm that HIP builds get the expansion that is intended.
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

// Sums `x` over a 32-lane group with an XOR-shuffle butterfly (lane masks 16, 8, 4, 2, 1).
// The shuffle uses the full 0xffffffff participant mask, so all 32 lanes are expected to call this
// together; each lane returns the accumulated value.
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int lane_mask = 16; lane_mask >= 1; lane_mask /= 2) {
        const float partner = __shfl_xor_sync(0xffffffff, x, lane_mask, 32);
        x = x + partner;
    }
    return x;
}

// float2 overload: reduces the .x and .y components independently with the same XOR-shuffle
// butterfly (lane masks 16, 8, 4, 2, 1; full 0xffffffff participant mask, width 32).
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int lane_mask = 16; lane_mask >= 1; lane_mask /= 2) {
        const float partner_x = __shfl_xor_sync(0xffffffff, a.x, lane_mask, 32);
        a.x = a.x + partner_x;
        const float partner_y = __shfl_xor_sync(0xffffffff, a.y, lane_mask, 32);
        a.y = a.y + partner_y;
    }
    return a;
}

#ifdef GGML_CUDA_F16
// half2 overload, only compiled when GGML_CUDA_F16 is defined.
// On HIP/AMD builds, or when __CUDA_ARCH__ is below CC_PASCAL, there is no implementation and the
// body is just NO_DEVICE_CODE; otherwise both halves are summed with __hadd2 over an XOR-shuffle
// butterfly (lane masks 16, 8, 4, 2, 1; full participant mask, width 32).
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_PASCAL
    GGML_UNUSED(a);
    NO_DEVICE_CODE;
#else
#pragma unroll
    for (int lane_mask = 16; lane_mask >= 1; lane_mask /= 2) {
        const half2 partner = __shfl_xor_sync(0xffffffff, a, lane_mask, 32);
        a = __hadd2(a, partner);
    }
    return a;
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_PASCAL
}
#endif // GGML_CUDA_F16

// Maximum of `x` over a 32-lane group, using fmaxf over an XOR-shuffle butterfly
// (lane masks 16, 8, 4, 2, 1). The shuffle uses the full 0xffffffff participant mask, so all
// 32 lanes are expected to call this together.
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int lane_mask = 16; lane_mask >= 1; lane_mask /= 2) {
        const float partner = __shfl_xor_sync(0xffffffff, x, lane_mask, 32);
        x = fmaxf(x, partner);
    }
    return x;
}

//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//#pragma unroll
//    for (int mask = 16; mask > 0; mask >>= 1) {
//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
//    }
//    return x;
//#else
//    GGML_UNUSED(x);
//    NO_DEVICE_CODE;
//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
//}


#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#ifndef __has_builtin
    #define __has_builtin(x) 0
#endif

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
// HIP stand-in for the CUDA intrinsic of the same name: treats `a` and `b` as four packed signed
// bytes and returns the four per-byte differences, each saturated to the int8_t range.
// Uses __builtin_elementwise_sub_sat when the compiler reports it; otherwise falls back to a scalar
// loop. The fallback clamps against the <stdint.h> limits INT8_MIN / INT8_MAX (available wherever
// int8_t is) rather than std::numeric_limits, whose header <limits> this file never includes.
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
#else
    int8x4_t c;
#pragma unroll
    for (int i = 0; i < 4; i++) {
        // the difference of two int8 values always fits in int16, so it is exact before clamping
        int16_t tmp = va[i] - vb[i];
        if (tmp > INT8_MAX) tmp = INT8_MAX;
        if (tmp < INT8_MIN) tmp = INT8_MIN;
        c[i] = tmp;
    }
    return reinterpret_cast<const int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}

// HIP stand-in for the CUDA intrinsic of the same name: per-byte subtraction of four packed bytes
// with wrap-around (modulo 256) in each lane, as the CUDA intrinsic is documented to do.
// The lanes are subtracted as unsigned bytes so that the wrap is well defined; the resulting bit
// pattern is the same one a two's-complement signed subtraction would produce. This differs from
// the saturating __vsubss4 only when a lane difference falls outside [-128, 127].
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    const uint8x4_t va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t vb = reinterpret_cast<const uint8x4_t&>(b);
    const uint8x4_t c  = va - vb; // element-wise, no promotion: each lane wraps modulo 256
    return reinterpret_cast<const int &>(c);
}

// HIP stand-in for the CUDA intrinsic of the same name: compares `a` and `b` byte by byte and
// returns a word whose bytes are 0xff where the corresponding bytes are equal and 0x00 elsewhere.
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t lhs = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t rhs = reinterpret_cast<const uint8x4_t&>(b);
    uint8x4_t eq;
#pragma unroll
    for (int lane = 0; lane < 4; ++lane) {
        if (lhs[lane] == rhs[lane]) {
            eq[lane] = 0xff;
        } else {
            eq[lane] = 0x00;
        }
    }
    return reinterpret_cast<const unsigned int &>(eq);
}

// HIP stand-in for the CUDA __dp4a intrinsic: returns `c` plus the dot product of the four packed
// bytes of `a` with the four packed bytes of `b`. The implementation is picked at compile time from
// the gfx target macros; the final #else branch is the portable reference (signed 8-bit lanes).
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    // NOTE(review): only __gfx1030__ is listed here, not the RDNA2 macro defined earlier in this
    // header - confirm whether the other RDNA2 targets are meant to take the generic fallback.
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    // Hand-written sequence: four v_mul_i32_i24 products of sign-extended byte selections
    // (BYTE_0..BYTE_3 of both operands), folded into c two at a time with v_add3_u32.
    // tmp1/tmp2 are scratch outputs marked early-clobber ("=&v"); c is read-write ("+v").
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    // generic fallback: reinterpret both words as four signed bytes and accumulate the products
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;
}
#endif // defined(GGML_USE_HIPBLAS)

// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);


//////////////////////

// Description of the devices known to the backend, as returned by ggml_cuda_info().
// Every member carries a default initializer, so a default-initialized object is fully zeroed.
struct ggml_cuda_device_info {
    // device count (presumably the number of valid entries in `devices` - confirm against the code
    // that fills this struct); initialized to 0 like the other members so that it is never
    // indeterminate
    int device_count = 0;

    // per-device properties
    struct cuda_device_info {
        int     cc;                 // compute capability
        size_t  smpb;               // max. shared memory per block
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
    };

    // one slot per possible device, value-initialized
    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    // one float per possible device, value-initialized to 0
    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int ggml_cuda_get_device();

// Abstract allocator interface. Concrete pools are obtained elsewhere (see
// ggml_backend_cuda_context::new_pool_for_device) and used through ggml_cuda_pool_alloc.
struct ggml_cuda_pool {
    // virtual so that pools can be destroyed through a base pointer
    virtual ~ggml_cuda_pool() = default;

    // Requests `size` units (ggml_cuda_pool_alloc passes a byte count) and reports a size through
    // `actual_size`; presumably the size actually reserved - confirm against the implementations.
    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    // Returns a buffer to the pool; ggml_cuda_pool_alloc passes back the `actual_size` it received.
    virtual void free(void * ptr, size_t size) = 0;
};

// Scoped owner of a single buffer taken from a ggml_cuda_pool: the buffer (if any) is handed back to
// the pool when the object is destroyed. Holds at most one allocation for its whole lifetime and can
// be neither copied nor moved.
template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr; // pool the buffer comes from / goes back to
    T * ptr = nullptr;               // current buffer, nullptr while nothing is allocated
    size_t actual_size = 0;          // size reported by the pool, passed back on free

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;

    // empty holder; a pool must be supplied later through alloc(pool, size)
    ggml_cuda_pool_alloc() = default;

    // binds to `pool` without allocating
    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    // binds to `pool` and immediately allocates `size` elements
    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr == nullptr) {
            return; // nothing was allocated
        }
        pool->free(ptr, actual_size);
    }

    // Allocates room for `size` elements of T (not bytes) from the bound pool.
    // Asserts that a pool is bound and that this holder has not allocated before.
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        const size_t n_bytes = size * sizeof(T);
        ptr = static_cast<T *>(pool->alloc(n_bytes, &this->actual_size));
        return ptr;
    }

    // Binds to `pool`, then allocates `size` elements from it.
    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    // Current buffer (nullptr if nothing has been allocated).
    T * get() {
        return ptr;
    }
};


// backend interface

// Per-tensor bookkeeping for tensors that are split across devices: one data pointer per device
// and one event per (device, stream) pair.
struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};

// Per-backend state: the backend's own device index and name, plus streams, cuBLAS handles and
// memory pools that are created on first use for any device index and released in the destructor
// (streams and handles explicitly, pools through their unique_ptr).
// NOTE: none of the accessors range-check their `device` / `stream` arguments.
struct ggml_backend_cuda_context {
    int device;                       // device index given at construction; default for the no-arg accessors
    std::string name;                 // GGML_CUDA_NAME followed by the device index
    cudaEvent_t copy_event = nullptr; // not created here; destroyed in the destructor if set

    // lazily created, indexed [device][stream]; nullptr means "not created yet"
    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    // lazily created, one per device; nullptr means "not created yet"
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    // Destroys the copy event, then for each device its streams followed by its cuBLAS handle.
    // Failures go through CUDA_CHECK / CUBLAS_CHECK, i.e. through the [[noreturn]] error reporter.
    ~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }

    // Returns stream `stream` of `device`, creating it on first request: the current device is
    // switched with ggml_cuda_set_device() and the stream is created with cudaStreamNonBlocking.
    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    // Stream 0 of this context's own device.
    cudaStream_t stream() {
        return stream(device, 0);
    }

    // Returns the cuBLAS handle of `device`, creating it on first request: the current device is
    // switched with ggml_cuda_set_device() and the math mode is set to CUBLAS_TF32_TENSOR_OP_MATH.
    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    // cuBLAS handle of this context's own device.
    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    // one lazily created pool per device, owned by this context
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    // factory for a device's pool; defined outside this header
    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    // Returns the pool of `device`, creating it through new_pool_for_device() on first request.
    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    // Pool of this context's own device.
    ggml_cuda_pool & pool() {
        return pool(device);
    }
};
cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00			`#pragma once`

sync : ggml (#6351) * sync : ggml ggml-ci * cuda : move GGML_CUDA_DMMV constants to dmmv.cuh --------- Co-authored-by: slaren <slarengh@gmail.com> 2024-03-29 16:45:46 +01:00			`#include "ggml.h"`
			`#include "ggml-cuda.h"`

cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00			`#include <memory>`

			`#if defined(GGML_USE_HIPBLAS)`
			`#define GGML_COMMON_DECL_HIP`
			`#define GGML_COMMON_IMPL_HIP`
			`#else`
			`#define GGML_COMMON_DECL_CUDA`
			`#define GGML_COMMON_IMPL_CUDA`
			`#endif`
sync : ggml (#6351) * sync : ggml ggml-ci * cuda : move GGML_CUDA_DMMV constants to dmmv.cuh --------- Co-authored-by: slaren <slarengh@gmail.com> 2024-03-29 16:45:46 +01:00			`#include "ggml-common.h"`
cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00
			`#include <cstdio>`
			`#include <array>`
			`#include <cassert>`
			`#include <cfloat>`
			`#include <string>`

			`#if defined(GGML_USE_HIPBLAS)`
			`#include <hip/hip_runtime.h>`
			`#include <hipblas/hipblas.h>`
			`#include <hip/hip_fp16.h>`
			`#ifdef __HIP_PLATFORM_AMD__`
			`// for rocblas_initialize()`
			`#include "rocblas/rocblas.h"`
			`#endif // __HIP_PLATFORM_AMD__`
			`#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F`
			`#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F`
			`#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F`
			`#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT`
			`#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT`
			`#define CUBLAS_OP_N HIPBLAS_OP_N`
			`#define CUBLAS_OP_T HIPBLAS_OP_T`
			`#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS`
			`#define CUBLAS_TF32_TENSOR_OP_MATH 0`
			`#define CUDA_R_16F HIPBLAS_R_16F`
			`#define CUDA_R_32F HIPBLAS_R_32F`
			`#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)`
			`#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6`
			`#define cublasCreate hipblasCreate`
			`#define cublasDestroy hipblasDestroy`
			`#define cublasGemmEx hipblasGemmEx`
			`#define cublasGemmBatchedEx hipblasGemmBatchedEx`
			`#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx`
			`#define cublasHandle_t hipblasHandle_t`
			`#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS`
			`#define cublasSetStream hipblasSetStream`
			`#define cublasSgemm hipblasSgemm`
			`#define cublasStatus_t hipblasStatus_t`
			`#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6`
			`#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer`
			`#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess`
			`#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess`
			`#define cudaDeviceProp hipDeviceProp_t`
			`#define cudaDeviceSynchronize hipDeviceSynchronize`
			`#define cudaError_t hipError_t`
			`#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled`
			`#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled`
			`#define cudaEventCreateWithFlags hipEventCreateWithFlags`
			`#define cudaEventDisableTiming hipEventDisableTiming`
			`#define cudaEventRecord hipEventRecord`
			`#define cudaEventSynchronize hipEventSynchronize`
			`#define cudaEvent_t hipEvent_t`
			`#define cudaEventDestroy hipEventDestroy`
			`#define cudaFree hipFree`
			`#define cudaFreeHost hipHostFree`
			`#define cudaGetDevice hipGetDevice`
			`#define cudaGetDeviceCount hipGetDeviceCount`
			`#define cudaGetDeviceProperties hipGetDeviceProperties`
			`#define cudaGetErrorString hipGetErrorString`
			`#define cudaGetLastError hipGetLastError`
			`#define cudaHostRegister hipHostRegister`
			`#define cudaHostRegisterPortable hipHostRegisterPortable`
			`#define cudaHostRegisterReadOnly hipHostRegisterReadOnly`
			`#define cudaHostUnregister hipHostUnregister`
			`#define cudaLaunchHostFunc hipLaunchHostFunc`
			`#ifdef GGML_HIP_UMA`
			`#define cudaMalloc hipMallocManaged`
			`#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)`
			`#else`
			`#define cudaMalloc hipMalloc`
			`#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)`
			`#endif`
			`#define cudaMemcpy hipMemcpy`
			`#define cudaMemcpyAsync hipMemcpyAsync`
			`#define cudaMemcpyPeerAsync hipMemcpyPeerAsync`
			`#define cudaMemcpy2DAsync hipMemcpy2DAsync`
			`#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice`
			`#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost`
			`#define cudaMemcpyHostToDevice hipMemcpyHostToDevice`
			`#define cudaMemcpyKind hipMemcpyKind`
			`#define cudaMemset hipMemset`
			`#define cudaMemsetAsync hipMemsetAsync`
			`#define cudaMemGetInfo hipMemGetInfo`
			`#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize`
			`#define cudaSetDevice hipSetDevice`
			`#define cudaStreamCreateWithFlags hipStreamCreateWithFlags`
			`#define cudaStreamDestroy hipStreamDestroy`
			`#define cudaStreamFireAndForget hipStreamFireAndForget`
			`#define cudaStreamNonBlocking hipStreamNonBlocking`
			`#define cudaStreamPerThread hipStreamPerThread`
			`#define cudaStreamSynchronize hipStreamSynchronize`
			`#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)`
			`#define cudaStream_t hipStream_t`
			`#define cudaSuccess hipSuccess`
			`#define __trap abort`
			`#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS`
			`#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED`
			`#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED`
			`#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE`
			`#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH`
			`#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR`
			`#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED`
			`#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR`
			`#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED`
			`#else`
			`#include <cuda_runtime.h>`
			`#include <cuda.h>`
			`#include <cublas_v2.h>`
			`#include <cuda_fp16.h>`

			`#if CUDART_VERSION < 11020`
			`#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED`
			`#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH`
			`#define CUBLAS_COMPUTE_16F CUDA_R_16F`
			`#define CUBLAS_COMPUTE_32F CUDA_R_32F`
			`#define cublasComputeType_t cudaDataType_t`
			`#endif // CUDART_VERSION < 11020`

			`#endif // defined(GGML_USE_HIPBLAS)`

			`#define STRINGIZE_IMPL(...) #__VA_ARGS__`
			`#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)`

			`#define WARP_SIZE 32`
			`#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)`

			`#define CC_PASCAL 600`
			`#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products`
			`#define CC_VOLTA 700`
			`#define CC_OFFSET_AMD 1000000`
			`#define CC_RDNA1 (CC_OFFSET_AMD + 1010)`
			`#define CC_RDNA2 (CC_OFFSET_AMD + 1030)`
			`#define CC_RDNA3 (CC_OFFSET_AMD + 1100)`

			`// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication`
			`// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant`
			`// for large computational tasks. the drawback is that this requires some extra amount of VRAM:`
			`// - 7B quantum model: +100-200 MB`
			`// - 13B quantum model: +200-400 MB`
			`//`
			`//#define GGML_CUDA_FORCE_MMQ`

			`// TODO: improve this to be correct for more hardware`
			`// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores`
			`#if !defined(GGML_CUDA_FORCE_MMQ)`
			`#define CUDA_USE_TENSOR_CORES`
			`#endif`

			`#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels`
			`#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available`

			`#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses`

			`#if defined(_MSC_VER)`
			`#pragma warning(disable: 4244 4267) // possible loss of data`
			`#endif`

			`#define GGML_CUDA_MAX_STREAMS 8`

			`[[noreturn]]`
			`void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);`

			`#define CUDA_CHECK_GEN(err, success, error_fn) \`
			`do { \`
			`auto err_ = (err); \`
			`if (err_ != (success)) { \`
			`ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \`
			`} \`
			`} while (0)`

			`#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)`

			`#if CUDART_VERSION >= 12000`
			`static const char * cublas_get_error_str(const cublasStatus_t err) {`
			`return cublasGetStatusString(err);`
			`}`
			`#else`
			`static const char * cublas_get_error_str(const cublasStatus_t err) {`
			`switch (err) {`
			`case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";`
			`case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";`
			`case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";`
			`case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";`
			`case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";`
			`case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";`
			`case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";`
			`case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";`
			`case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";`
			`default: return "unknown error";`
			`}`
			`}`
			`#endif // CUDART_VERSION >= 12000`

			`#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)`

			`#if !defined(GGML_USE_HIPBLAS)`
			`static const char * cu_get_error_str(CUresult err) {`
			`const char * err_str;`
			`cuGetErrorString(err, &err_str);`
			`return err_str;`
			`}`
			`#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)`
			`#endif`

			`#if CUDART_VERSION >= 11100`
			`#define GGML_CUDA_ASSUME(x) __builtin_assume(x)`
			`#else`
			`#define GGML_CUDA_ASSUME(x)`
			`#endif // CUDART_VERSION >= 11100`

			`#ifdef GGML_CUDA_F16`
			`typedef half dfloat; // dequantize float`
			`typedef half2 dfloat2;`
			`#else`
			`typedef float dfloat; // dequantize float`
			`typedef float2 dfloat2;`
			`#endif //GGML_CUDA_F16`

			`[[noreturn]]`
			`static __device__ void no_device_code(`
			`const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {`

			`#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)`
			`printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",`
			`file_name, line, function_name, arch);`
			`GGML_UNUSED(arch_list);`
			`#else`
			`printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",`
			`file_name, line, function_name, arch, arch_list);`
			`#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)`
			`__trap();`

			`GGML_UNUSED(no_device_code); // suppress unused function warning`
			`}`

			`#ifdef __CUDA_ARCH__`
			`#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))`
			`#else`
			`#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")`
			`#endif // __CUDA_ARCH__`

			// Sums x across the 32 lanes of a warp using XOR shuffles (lane offsets 16, 8, 4, 2, 1).
			// Every lane returns the full sum. The shuffles use the full mask 0xffffffff, so all
			// 32 lanes of the warp have to execute this call.
			static __device__ __forceinline__ float warp_reduce_sum(float x) {
			#pragma unroll
			    for (int mask = 16; mask > 0; mask >>= 1) {
			        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
			    }
			    return x;
			}

			// float2 overload: sums the .x and .y components independently across the 32 lanes of a
			// warp using XOR shuffles. Every lane returns both full sums. Uses the full mask
			// 0xffffffff, so all 32 lanes of the warp have to execute this call.
			static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
			#pragma unroll
			    for (int mask = 16; mask > 0; mask >>= 1) {
			        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
			        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
			    }
			    return a;
			}

			#ifdef GGML_CUDA_F16
			// half2 overload (only compiled when GGML_CUDA_F16 is defined): sums a across the 32 lanes
			// of a warp with __hadd2 and XOR shuffles; every lane returns the full sum.
			// The real implementation is only built for non-HIP/AMD targets with
			// __CUDA_ARCH__ >= CC_PASCAL; everywhere else the body is just NO_DEVICE_CODE.
			static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
			#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
			#pragma unroll
			    for (int mask = 16; mask > 0; mask >>= 1) {
			        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
			    }
			    return a;
			#else
			    GGML_UNUSED(a);
			    NO_DEVICE_CODE;
			#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
			}
			#endif // GGML_CUDA_F16

			// Computes the maximum of x across the 32 lanes of a warp using fmaxf and XOR shuffles
			// (lane offsets 16, 8, 4, 2, 1). Every lane returns the maximum. Uses the full mask
			// 0xffffffff, so all 32 lanes of the warp have to execute this call.
			static __device__ __forceinline__ float warp_reduce_max(float x) {
			#pragma unroll
			    for (int mask = 16; mask > 0; mask >>= 1) {
			        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
			    }
			    return x;
			}

			//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
			//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
			//#pragma unroll
			//    for (int mask = 16; mask > 0; mask >>= 1) {
			//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
			//    }
			//    return x;
			//#else
			//    GGML_UNUSED(x);
			//    NO_DEVICE_CODE;
			//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
			//}


			`#if defined(GGML_USE_HIPBLAS)`
			`#define __CUDA_ARCH__ 1300`

			`#if defined(__gfx1100__) \|\| defined(__gfx1101__) \|\| defined(__gfx1102__) \|\| defined(__gfx1103__) \|\| \`
			`defined(__gfx1150__) \|\| defined(__gfx1151__)`
			`#define RDNA3`
			`#endif`

			`#if defined(__gfx1030__) \|\| defined(__gfx1031__) \|\| defined(__gfx1032__) \|\| defined(__gfx1033__) \|\| \`
			`defined(__gfx1034__) \|\| defined(__gfx1035__) \|\| defined(__gfx1036__) \|\| defined(__gfx1037__)`
			`#define RDNA2`
			`#endif`

			`#ifndef __has_builtin`
			`#define __has_builtin(x) 0`
			`#endif`

			`typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));`
			`typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));`
			// HIP replacement for the __vsubss4 intrinsic: treats a and b as four packed signed bytes
			// and returns the four per-byte differences a[i] - b[i], each saturated to the int8_t range,
			// packed back into an int.
			static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
			    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
			    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
			#if __has_builtin(__builtin_elementwise_sub_sat)
			    // the compiler provides a saturating vector subtraction
			    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
			    return reinterpret_cast<const int &>(c);
			#else
			    // manual fallback: subtract in 16 bits, then clamp each lane to [INT8_MIN, INT8_MAX]
			    int8x4_t c;
			    int16_t tmp;
			#pragma unroll
			    for (int i = 0; i < 4; i++) {
			        tmp = va[i] - vb[i];
			        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
			        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
			        c[i] = tmp;
			    }
			    return reinterpret_cast<int &>(c);
			#endif // __has_builtin(__builtin_elementwise_sub_sat)
			}

			// HIP replacement for the __vsub4 intrinsic; simply forwards to __vsubss4.
			// NOTE(review): __vsubss4 saturates each byte, whereas CUDA documents __vsub4 as a plain
			// per-byte subtraction - results differ if a byte difference leaves the int8_t range.
			// Confirm that callers never hit that case.
			static __device__ __forceinline__ int __vsub4(const int a, const int b) {
			    return __vsubss4(a, b);
			}

			// HIP replacement for the __vcmpeq4 intrinsic: compares a and b byte by byte and returns a
			// mask in which each byte is 0xff where the corresponding bytes are equal and 0x00 otherwise.
			static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
			    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
			    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
			    unsigned int c;
			    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c); // byte-wise view of the result
			#pragma unroll
			    for (int i = 0; i < 4; ++i) {
			        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
			    }
			    return c;
			}

			`static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {`
			`#if defined(__gfx906__) \|\| defined(__gfx908__) \|\| defined(__gfx90a__) \|\| defined(__gfx1030__)`
			`c = __builtin_amdgcn_sdot4(a, b, c, false);`
			`#elif defined(RDNA3)`
			`c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);`
			`#elif defined(__gfx1010__) \|\| defined(__gfx900__)`
			`int tmp1;`
			`int tmp2;`
			`asm("\n \`
			`v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \`
			`v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \`
			`v_add3_u32 %0, %1, %2, %0 \n \`
			`v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \`
			`v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \`
			`v_add3_u32 %0, %1, %2, %0 \n \`
			`"`
			`: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)`
			`: "v"(a), "v"(b)`
			`);`
			`#else`
			`const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);`
			`const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);`
			`c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];`
			`#endif`
			`return c;`
			`}`
			`#endif // defined(GGML_USE_HIPBLAS)`

			// TODO: move to ggml-common.h
			// 16-entry table of int8_t values declared with __device__.
			// NOTE(review): the name suggests these are the values for the IQ4_NL quantization type - confirm.
			static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

			// Pointer to a dequantization routine.
			//   vx:  untyped pointer to the quantized data
			//   ib:  integer index (presumably the block index - confirm against the implementations)
			//   iqs: integer index (presumably the position inside the block - confirm against the implementations)
			//   v:   output, receives a pair of dequantized values
			typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);


			//////////////////////

			// Collected properties of the available devices: a device count, one record per device slot
			// and a default tensor split. All per-device storage is sized by GGML_CUDA_MAX_DEVICES
			// and value-initialized.
			struct ggml_cuda_device_info {
			    int device_count;

			    struct cuda_device_info {
			        int     cc;                 // compute capability
			        size_t  smpb;               // max. shared memory per block
			        bool    vmm;                // virtual memory support
			        size_t  vmm_granularity;    // granularity of virtual memory
			        size_t  total_vram;
			    };

			    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

			    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
			};

			// Declarations only; the definitions live outside this header section.

			// Returns a reference to a ggml_cuda_device_info record.
			const ggml_cuda_device_info & ggml_cuda_info();

			// Setter / getter for a device index (presumably the active CUDA device - see the definitions).
			void ggml_cuda_set_device(int device);
			int ggml_cuda_get_device();

			// Abstract interface of a memory pool.
			struct ggml_cuda_pool {
			    virtual ~ggml_cuda_pool() = default;

			    // Requests size bytes; the implementation reports the size it actually handed out
			    // through *actual_size and returns the pointer.
			    virtual void * alloc(size_t size, size_t * actual_size) = 0;
			    // Gives ptr back to the pool; ggml_cuda_pool_alloc passes the actual size reported by alloc().
			    virtual void free(void * ptr, size_t size) = 0;
			};

			`template<typename T>`
			`struct ggml_cuda_pool_alloc {`
			`ggml_cuda_pool * pool = nullptr;`
			`T * ptr = nullptr;`
			`size_t actual_size = 0;`

			`ggml_cuda_pool_alloc() = default;`

			`explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {`
			`}`

			`ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {`
			`alloc(size);`
			`}`

			`~ggml_cuda_pool_alloc() {`
			`if (ptr != nullptr) {`
			`pool->free(ptr, actual_size);`
			`}`
			`}`

			`// size is in number of elements`
			`T * alloc(size_t size) {`
			`GGML_ASSERT(pool != nullptr);`
			`GGML_ASSERT(ptr == nullptr);`
			`ptr = (T ) pool->alloc(size sizeof(T), &this->actual_size);`
			`return ptr;`
			`}`

			`T * alloc(ggml_cuda_pool & pool, size_t size) {`
			`this->pool = &pool;`
			`return alloc(size);`
			`}`

			`T * get() {`
			`return ptr;`
			`}`

			`ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;`
			`ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;`
			`ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;`
			`ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;`
			`};`


			// backend interface

			// Per-tensor GPU bookkeeping: one data pointer per device and one event per (device, stream) slot.
			struct ggml_tensor_extra_gpu {
			    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
			    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
			};

			`struct ggml_backend_cuda_context {`
			`int device;`
			`std::string name;`
			`cudaEvent_t copy_event = nullptr;`

			`cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };`
			`cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};`

			`explicit ggml_backend_cuda_context(int device) :`
			`device(device),`
			`name(GGML_CUDA_NAME + std::to_string(device)) {`
			`}`

			`~ggml_backend_cuda_context() {`
			`if (copy_event != nullptr) {`
			`CUDA_CHECK(cudaEventDestroy(copy_event));`
			`}`
			`for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {`
			`for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {`
			`if (streams[i][j] != nullptr) {`
			`CUDA_CHECK(cudaStreamDestroy(streams[i][j]));`
			`}`
			`}`
			`if (cublas_handles[i] != nullptr) {`
			`CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));`
			`}`
			`}`
			`}`

			`cudaStream_t stream(int device, int stream) {`
			`if (streams[device][stream] == nullptr) {`
			`ggml_cuda_set_device(device);`
			`CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));`
			`}`
			`return streams[device][stream];`
			`}`

			`cudaStream_t stream() {`
			`return stream(device, 0);`
			`}`

			`cublasHandle_t cublas_handle(int device) {`
			`if (cublas_handles[device] == nullptr) {`
			`ggml_cuda_set_device(device);`
			`CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));`
			`CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));`
			`}`
			`return cublas_handles[device];`
			`}`

			`cublasHandle_t cublas_handle() {`
			`return cublas_handle(device);`
			`}`

			`// pool`
			`std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];`

			`static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);`

			`ggml_cuda_pool & pool(int device) {`
			`if (pools[device] == nullptr) {`
			`pools[device] = new_pool_for_device(device);`
			`}`
			`return *pools[device];`
			`}`

			`ggml_cuda_pool & pool() {`
			`return pool(device);`
			`}`
			`};`