diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 080193cbd..98343d208 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -443,7 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_SCALE_BLOCK_SIZE 256 #define CUDA_CLAMP_BLOCK_SIZE 256 #define CUDA_ROPE_BLOCK_SIZE 256 -#define CUDA_SOFT_MAX_BLOCK_SIZE 512 +#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 #define CUDA_ALIBI_BLOCK_SIZE 32 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 #define CUDA_QUANTIZE_BLOCK_SIZE 256