2024-06-09 09:42:25 +02:00
|
|
|
#pragma once
|
|
|
|
|
2024-03-25 13:50:23 +01:00
|
|
|
#include "common.cuh"
|
2024-06-09 09:42:25 +02:00
|
|
|
#include "mmq.cuh"
|
|
|
|
|
|
|
|
#include <cstdint>
|
2024-03-25 13:50:23 +01:00
|
|
|
|
|
|
|
// Block-size constant for the quantization code (256).
// NOTE(review): presumably the number of threads per CUDA block used when
// launching the quantize kernels; the launch sites are not in this header, so
// confirm against the implementation file.
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
|
|
|
|
2024-06-09 09:42:25 +02:00
|
|
|
// Pointer-to-function type for a host-side quantization entry point.
// quantize_row_q8_1_cuda and quantize_mmq_q8_1_cuda, both declared in this
// header, have exactly this signature and can be stored in a quantize_cuda_t.
//
// Parameter semantics are not visible from this header; anything marked
// "presumably" is an assumption to confirm against the implementation.
//   x          - read-only float input.
//   vy         - untyped destination buffer; presumably receives the quantized data.
//   kx0, kx1, channels
//              - 64-bit extents; presumably the dimensions of the input.
//   kx0_padded - presumably kx0 rounded up to the padding the output layout requires.
//   type_x     - a ggml_type tag; its role is not visible here.
//   stream     - CUDA stream handle; presumably the stream the work is enqueued on.
typedef void (*quantize_cuda_t)(
    const float * x,
    void * vy,
    const int64_t kx0,
    const int64_t kx1,
    const int64_t channels,
    const int64_t kx0_padded,
    const ggml_type type_x,
    cudaStream_t stream);
|
|
|
|
|
|
|
|
// Declaration only; the definition is not part of this header.
// The signature matches the quantize_cuda_t function-pointer type.
//
// NOTE(review): going by the name, this presumably quantizes float rows into
// q8_1 blocks; verify against the implementation. Parameter notes marked
// "presumably" are likewise assumptions.
//   x          - read-only float input.
//   vy         - untyped destination buffer; presumably receives the quantized data.
//   kx0, kx1, channels
//              - 64-bit extents; presumably the dimensions of the input.
//   kx0_padded - presumably kx0 rounded up to the padding the output layout requires.
//   type_x     - a ggml_type tag; its role is not visible here.
//   stream     - CUDA stream handle; presumably the stream the work is enqueued on.
void quantize_row_q8_1_cuda(
    const float * x,
    void * vy,
    const int64_t kx0,
    const int64_t kx1,
    const int64_t channels,
    const int64_t kx0_padded,
    const ggml_type type_x,
    cudaStream_t stream);
|
|
|
|
|
|
|
|
// Declaration only; the definition is not part of this header.
// The signature matches the quantize_cuda_t function-pointer type.
//
// NOTE(review): going by the name and the "mmq.cuh" include, this presumably
// quantizes float input into a q8_1 layout consumed by the MMQ kernels; verify
// against the implementation. Parameter notes marked "presumably" are
// likewise assumptions.
//   x          - read-only float input.
//   vy         - untyped destination buffer; presumably receives the quantized data.
//   kx0, kx1, channels
//              - 64-bit extents; presumably the dimensions of the input.
//   kx0_padded - presumably kx0 rounded up to the padding the output layout requires.
//   type_x     - a ggml_type tag; its role is not visible here.
//   stream     - CUDA stream handle; presumably the stream the work is enqueued on.
void quantize_mmq_q8_1_cuda(
    const float * x,
    void * vy,
    const int64_t kx0,
    const int64_t kx1,
    const int64_t channels,
    const int64_t kx0_padded,
    const ggml_type type_x,
    cudaStream_t stream);
|