llama.cpp/ggml-cuda/quantize.cuh

#pragma once

#include "common.cuh"
#include "mmq.cuh"

#include <cstdint>

// Block-size constant for the CUDA quantization code.
// NOTE(review): presumably the number of threads per CUDA block used when launching the
// quantization kernels; the launch sites are not visible in this header -- confirm in quantize.cu.
#define CUDA_QUANTIZE_BLOCK_SIZE 256

// Pointer-to-function type matching the quantization entry points declared in this header
// (quantize_row_q8_1_cuda and quantize_mmq_q8_1_cuda share exactly this signature), so
// either one can be stored in a variable of this type.
//
//   x           read-only float input
//   vy          untyped destination pointer
//               NOTE(review): presumably receives the quantized blocks -- confirm in quantize.cu
//   kx0, kx1,
//   channels    64-bit extents
//               NOTE(review): presumably the sizes of the input along its dimensions -- confirm
//   kx0_padded  NOTE(review): presumably kx0 rounded up to a padding boundary -- confirm
//   type_x      a ggml_type value; its use is not visible from this header
//   stream      CUDA stream handle
//               NOTE(review): presumably the stream the work is enqueued on -- confirm
using quantize_cuda_t = void (*)(
    const float * x,
    void * vy,
    const int64_t kx0,
    const int64_t kx1,
    const int64_t channels,
    const int64_t kx0_padded,
    const ggml_type type_x,
    cudaStream_t stream);

// Host-callable quantization entry point; its signature matches quantize_cuda_t.
// NOTE(review): going by the name, this converts the float data at `x` into q8_1 blocks
// written to `vy`; the definition is not in this header -- confirm in quantize.cu.
//
//   x           read-only float input
//   vy          untyped destination pointer
//   kx0, kx1,
//   channels    64-bit extents (NOTE(review): presumably input dimensions -- confirm)
//   kx0_padded  NOTE(review): presumably kx0 rounded up to a padding boundary -- confirm
//   type_x      a ggml_type value; its use is not visible from this header
//   stream      CUDA stream handle (NOTE(review): presumably where the work is enqueued)
void quantize_row_q8_1_cuda(
    const float * x,
    void * vy,
    const int64_t kx0,
    const int64_t kx1,
    const int64_t channels,
    const int64_t kx0_padded,
    const ggml_type type_x,
    cudaStream_t stream);

// Host-callable quantization entry point; its signature matches quantize_cuda_t.
// NOTE(review): the "mmq" in the name presumably refers to the mul_mat_q path (this header
// includes mmq.cuh), i.e. a q8_1 layout tailored for that kernel; the definition is not in
// this header -- confirm in quantize.cu.
//
//   x           read-only float input
//   vy          untyped destination pointer
//   kx0, kx1,
//   channels    64-bit extents (NOTE(review): presumably input dimensions -- confirm)
//   kx0_padded  NOTE(review): presumably kx0 rounded up to a padding boundary -- confirm
//   type_x      a ggml_type value; its use is not visible from this header
//   stream      CUDA stream handle (NOTE(review): presumably where the work is enqueued)
void quantize_mmq_q8_1_cuda(
    const float * x,
    void * vy,
    const int64_t kx0,
    const int64_t kx1,
    const int64_t channels,
    const int64_t kx0_padded,
    const ggml_type type_x,
    cudaStream_t stream);
CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00			`#pragma once`

cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00			`#include "common.cuh"`
CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00			`#include "mmq.cuh"`

			`#include <cstdint>`
cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00
			`#define CUDA_QUANTIZE_BLOCK_SIZE 256`

CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00			`typedef void (*quantize_cuda_t)(`
			`const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,`
			`const ggml_type type_x, cudaStream_t stream);`

			`void quantize_row_q8_1_cuda(`
			`const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,`
			`const ggml_type type_x, cudaStream_t stream);`

			`void quantize_mmq_q8_1_cuda(`
			`const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,`
			`const ggml_type type_x, cudaStream_t stream);`