llama.cpp/ggml-cuda.h

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#include <cublas_v2.h>
#include <cuda_runtime.h>

#ifdef  __cplusplus
extern "C" {
#endif

// Checks the result of a CUDA runtime call.
// `err` is evaluated exactly once and stored as a cudaError_t. If it equals
// cudaSuccess nothing else happens; otherwise the numeric code, the call site
// (__FILE__/__LINE__) and the text from cudaGetErrorString() are written to
// stderr and the process terminates via exit(1).
// The do { } while (0) wrapper makes the expansion behave as one statement.
#define CUDA_CHECK(err)                                                                 \
    do {                                                                                \
        const cudaError_t cuda_status_ = (err);                                         \
        if (cuda_status_ == cudaSuccess) {                                              \
            break; /* success: leave the wrapper, nothing to report */                  \
        }                                                                               \
        fprintf(stderr, "CUDA error %d at %s:%d: %s\n", cuda_status_, __FILE__,         \
            __LINE__, cudaGetErrorString(cuda_status_));                                \
        exit(1);                                                                        \
    } while (0)

// Checks the result of a cuBLAS call.
// `err` is evaluated exactly once and stored as a cublasStatus_t. If it equals
// CUBLAS_STATUS_SUCCESS nothing else happens; otherwise the numeric status and
// the call site (__FILE__/__LINE__) are written to stderr and the process
// terminates via exit(1).
// The do { } while (0) wrapper makes the expansion behave as one statement.
#define CUBLAS_CHECK(err)                                                               \
    do {                                                                                \
        const cublasStatus_t cublas_status_ = (err);                                    \
        if (cublas_status_ == CUBLAS_STATUS_SUCCESS) {                                  \
            break; /* success: leave the wrapper, nothing to report */                  \
        }                                                                               \
        fprintf(stderr, "cuBLAS error %d at %s:%d\n", cublas_status_, __FILE__,         \
            __LINE__);                                                                  \
        exit(1);                                                                        \
    } while (0)

// Global cuBLAS handle. Only declared here (extern); the definition lives in
// another translation unit.
// NOTE(review): presumably created by ggml_init_cublas() - confirm in the implementation.
extern cublasHandle_t g_cublasH;
// Global CUDA stream. Only declared here (extern); the definition lives in
// another translation unit.
// NOTE(review): presumably the stream bound to g_cublasH - confirm in the implementation.
extern cudaStream_t   g_cudaStream;

// Initialization entry point for the cuBLAS backend; takes no arguments and
// returns nothing. The implementation is not part of this header.
// NOTE(review): presumably sets up g_cublasH and g_cudaStream - confirm.
void   ggml_init_cublas(void);
// Obtains a buffer from the CUDA memory pool.
//   size        - requested size
//   actual_size - out-parameter
// NOTE(review): presumably `size` is in bytes, the returned pointer is device
// memory of at least `size` bytes, and *actual_size receives the real capacity
// of the returned buffer - confirm against the implementation.
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
// Hands a buffer back to the CUDA memory pool.
//   ptr  - buffer to release
//   size - size associated with the buffer
// NOTE(review): presumably `ptr` must come from ggml_cuda_pool_malloc() and
// `size` should be the value it reported through actual_size - confirm.
void   ggml_cuda_pool_free(void * ptr, size_t size);

// CUDA dequantization entry points, one per quantization format named in the
// function (q4_0, q4_1, q4_2, q4_3, q5_0, q5_1, q8_0). All share one signature:
//   vx     - read-only, untyped pointer to the input data
//   y      - float output buffer
//   k      - element count
//   stream - CUDA stream passed through to the implementation
// NOTE(review): presumably vx holds quantized blocks of the matching format,
// vx and y are device pointers, k is the number of floats written to y, and
// the work is enqueued asynchronously on `stream` - confirm in ggml-cuda.cu.
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);

#ifdef  __cplusplus
}
#endif
Improve cuBLAS performance by using a memory pool (#1094) * Improve cuBLAS performance by using a memory pool * Move cuda specific definitions to ggml-cuda.h/cu * Add CXX flags to nvcc * Change memory pool synchronization mechanism to a spin lock General code cleanup 2023-04-21 21:59:17 +02:00			`#include <cublas_v2.h>`
			`#include <cuda_runtime.h>`

Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 03:14:14 +02:00			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

Improve cuBLAS performance by using a memory pool (#1094) * Improve cuBLAS performance by using a memory pool * Move cuda specific definitions to ggml-cuda.h/cu * Add CXX flags to nvcc * Change memory pool synchronization mechanism to a spin lock General code cleanup 2023-04-21 21:59:17 +02:00			`#define CUDA_CHECK(err) \`
			`do { \`
			`cudaError_t err_ = (err); \`
			`if (err_ != cudaSuccess) { \`
			`fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \`
			`cudaGetErrorString(err_)); \`
			`exit(1); \`
			`} \`
			`} while (0)`

			`#define CUBLAS_CHECK(err) \`
			`do { \`
			`cublasStatus_t err_ = (err); \`
			`if (err_ != CUBLAS_STATUS_SUCCESS) { \`
			`fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \`
			`exit(1); \`
			`} \`
			`} while (0)`

			`extern cublasHandle_t g_cublasH;`
			`extern cudaStream_t g_cudaStream;`

			`void ggml_init_cublas(void);`
			`void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);`
			`void ggml_cuda_pool_free(void * ptr, size_t size);`

Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 03:14:14 +02:00			`void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
			`void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
			`void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
Add Q4_3 support to cuBLAS (#1086) 2023-04-20 20:49:53 +02:00			`void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
ggml : add Q5_0 and Q5_1 quantization (#1187) * ggml : add Q5_0 quantization (cuBLAS only) * ggml : fix Q5_0 qh -> uint32_t * ggml : fix q5_0 histogram stats * ggml : q5_0 scalar dot product * ggml : q5_0 ARM NEON dot * ggml : q5_0 more efficient ARM NEON using uint64_t masks * ggml : rename Q5_0 -> Q5_1 * ggml : adding Q5_0 mode * quantize : add Q5_0 and Q5_1 to map * ggml : AVX2 optimizations for Q5_0, Q5_1 (#1195) --------- Co-authored-by: Stephan Walter <stephan@walter.name> 2023-04-26 22:14:13 +02:00			`void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
			`void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179) * ggml : add Q8_0 quantization format (rename the old one to Q8_1) * tests : fix test-quantize-fns * ggml : finalize Q8_0 implementation * ggml : use q4_0_q8_0 and q4_2_q8_0 * ggml : fix Q8_0 dot product bug (ARM) * ggml : Q8_0 unroll x2 * ggml : fix bug - using wrong block type * ggml : extend quantize_fns_t with "vec_dot_type" * ggml : fix Q8_0 to use 255 values out of 256 * ggml : fix assert using wrong QK4_2 instead of QK4_3 2023-04-25 22:40:51 +02:00			`void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);`
Improve cuBLAS performance by dequantizing on the GPU (#1065) 2023-04-20 03:14:14 +02:00
			`#ifdef __cplusplus`
			`}`
			`#endif`