2024-03-25 13:50:23 +01:00
|
|
|
#pragma once
|
|
|
|
|
2024-03-29 16:45:46 +01:00
|
|
|
#include "ggml.h"
|
|
|
|
#include "ggml-cuda.h"
|
|
|
|
|
2024-03-25 13:50:23 +01:00
|
|
|
#include <memory>
|
|
|
|
|
|
|
|
#if defined(GGML_USE_HIPBLAS)
|
|
|
|
#define GGML_COMMON_DECL_HIP
|
|
|
|
#define GGML_COMMON_IMPL_HIP
|
|
|
|
#else
|
|
|
|
#define GGML_COMMON_DECL_CUDA
|
|
|
|
#define GGML_COMMON_IMPL_CUDA
|
|
|
|
#endif
|
2024-03-29 16:45:46 +01:00
|
|
|
#include "ggml-common.h"
|
2024-03-25 13:50:23 +01:00
|
|
|
|
|
|
|
#include <cstdio>
|
|
|
|
#include <array>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cfloat>
|
|
|
|
#include <string>
|
2024-05-08 22:55:49 +02:00
|
|
|
#include <vector>
|
2024-03-25 13:50:23 +01:00
|
|
|
|
|
|
|
#if defined(GGML_USE_HIPBLAS)
|
|
|
|
#include <hip/hip_runtime.h>
|
|
|
|
#include <hipblas/hipblas.h>
|
|
|
|
#include <hip/hip_fp16.h>
|
|
|
|
#ifdef __HIP_PLATFORM_AMD__
|
|
|
|
// for rocblas_initialize()
|
|
|
|
#include "rocblas/rocblas.h"
|
|
|
|
#endif // __HIP_PLATFORM_AMD__
|
|
|
|
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
|
|
|
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
|
|
|
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
|
|
|
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
|
|
|
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
|
|
|
#define CUBLAS_OP_N HIPBLAS_OP_N
|
|
|
|
#define CUBLAS_OP_T HIPBLAS_OP_T
|
|
|
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
|
|
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
|
|
|
#define CUDA_R_16F HIPBLAS_R_16F
|
|
|
|
#define CUDA_R_32F HIPBLAS_R_32F
|
|
|
|
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
|
|
|
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
|
|
|
#define cublasCreate hipblasCreate
|
|
|
|
#define cublasDestroy hipblasDestroy
|
|
|
|
#define cublasGemmEx hipblasGemmEx
|
|
|
|
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
|
|
|
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
|
|
|
#define cublasHandle_t hipblasHandle_t
|
|
|
|
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
|
|
|
#define cublasSetStream hipblasSetStream
|
|
|
|
#define cublasSgemm hipblasSgemm
|
|
|
|
#define cublasStatus_t hipblasStatus_t
|
|
|
|
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
|
|
|
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
|
|
|
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
|
|
|
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
|
|
|
#define cudaDeviceProp hipDeviceProp_t
|
|
|
|
#define cudaDeviceSynchronize hipDeviceSynchronize
|
|
|
|
#define cudaError_t hipError_t
|
|
|
|
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
|
|
|
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
|
|
|
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
|
|
|
#define cudaEventDisableTiming hipEventDisableTiming
|
|
|
|
#define cudaEventRecord hipEventRecord
|
|
|
|
#define cudaEventSynchronize hipEventSynchronize
|
|
|
|
#define cudaEvent_t hipEvent_t
|
|
|
|
#define cudaEventDestroy hipEventDestroy
|
|
|
|
#define cudaFree hipFree
|
|
|
|
#define cudaFreeHost hipHostFree
|
|
|
|
#define cudaGetDevice hipGetDevice
|
|
|
|
#define cudaGetDeviceCount hipGetDeviceCount
|
|
|
|
#define cudaGetDeviceProperties hipGetDeviceProperties
|
|
|
|
#define cudaGetErrorString hipGetErrorString
|
|
|
|
#define cudaGetLastError hipGetLastError
|
|
|
|
#define cudaHostRegister hipHostRegister
|
|
|
|
#define cudaHostRegisterPortable hipHostRegisterPortable
|
|
|
|
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
|
|
|
#define cudaHostUnregister hipHostUnregister
|
|
|
|
#define cudaLaunchHostFunc hipLaunchHostFunc
|
|
|
|
#ifdef GGML_HIP_UMA
|
|
|
|
#define cudaMalloc hipMallocManaged
|
|
|
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
|
|
|
#else
|
|
|
|
#define cudaMalloc hipMalloc
|
|
|
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
|
|
|
#endif
|
|
|
|
#define cudaMemcpy hipMemcpy
|
|
|
|
#define cudaMemcpyAsync hipMemcpyAsync
|
|
|
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
|
|
|
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
|
|
|
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
|
|
|
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
|
|
|
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
|
|
|
#define cudaMemcpyKind hipMemcpyKind
|
|
|
|
#define cudaMemset hipMemset
|
|
|
|
#define cudaMemsetAsync hipMemsetAsync
|
|
|
|
#define cudaMemGetInfo hipMemGetInfo
|
|
|
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
|
|
|
#define cudaSetDevice hipSetDevice
|
|
|
|
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
|
|
|
#define cudaStreamDestroy hipStreamDestroy
|
|
|
|
#define cudaStreamFireAndForget hipStreamFireAndForget
|
|
|
|
#define cudaStreamNonBlocking hipStreamNonBlocking
|
|
|
|
#define cudaStreamPerThread hipStreamPerThread
|
|
|
|
#define cudaStreamSynchronize hipStreamSynchronize
|
|
|
|
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
|
|
|
#define cudaStream_t hipStream_t
|
|
|
|
#define cudaSuccess hipSuccess
|
|
|
|
#define __trap abort
|
|
|
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
|
|
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
|
|
|
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
|
|
|
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
|
|
|
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
|
|
|
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
|
|
|
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
|
|
|
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
|
|
|
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
|
|
|
#else
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
#include <cuda.h>
|
|
|
|
#include <cublas_v2.h>
|
|
|
|
#include <cuda_fp16.h>
|
|
|
|
|
|
|
|
#if CUDART_VERSION < 11020
|
|
|
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
|
|
|
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
|
|
|
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
|
|
|
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
|
|
|
#define cublasComputeType_t cudaDataType_t
|
|
|
|
#endif // CUDART_VERSION < 11020
|
|
|
|
|
|
|
|
#endif // defined(GGML_USE_HIPBLAS)
|
|
|
|
|
|
|
|
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
|
|
|
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
|
|
|
|
|
|
|
#define WARP_SIZE 32
|
2024-05-01 14:46:37 +02:00
|
|
|
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
|
|
|
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
|
2024-03-25 13:50:23 +01:00
|
|
|
|
|
|
|
#define CC_PASCAL 600
|
|
|
|
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
|
|
|
#define CC_VOLTA 700
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
#define CC_AMPERE 800
|
2024-03-25 13:50:23 +01:00
|
|
|
#define CC_OFFSET_AMD 1000000
|
|
|
|
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
|
|
|
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
|
|
|
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
|
|
|
|
|
|
|
// define this if you want to always fall back to MMQ kernels and not use cuBLAS for matrix multiplication
|
|
|
|
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
|
|
|
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
|
|
|
// - 7B quantum model: +100-200 MB
|
|
|
|
// - 13B quantum model: +200-400 MB
|
|
|
|
//
|
|
|
|
//#define GGML_CUDA_FORCE_MMQ
|
|
|
|
|
|
|
|
// TODO: improve this to be correct for more hardware
|
|
|
|
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
|
|
|
#if !defined(GGML_CUDA_FORCE_MMQ)
|
|
|
|
#define CUDA_USE_TENSOR_CORES
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
|
|
|
|
#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
|
|
|
|
|
|
|
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
|
|
|
|
|
|
|
#if defined(_MSC_VER)
|
|
|
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define GGML_CUDA_MAX_STREAMS 8
|
|
|
|
|
|
|
|
[[noreturn]]
|
|
|
|
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
|
|
|
|
|
|
|
|
#define CUDA_CHECK_GEN(err, success, error_fn) \
|
|
|
|
do { \
|
|
|
|
auto err_ = (err); \
|
|
|
|
if (err_ != (success)) { \
|
|
|
|
ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
|
|
|
|
} \
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
|
|
|
|
|
|
|
|
// Maps a cuBLAS status code to a printable string for CUBLAS_CHECK.
// When CUDART_VERSION is at least 12000 the library's own cublasGetStatusString() is used;
// otherwise (older or undefined CUDART_VERSION) the name is looked up in the table below.
static const char * cublas_get_error_str(const cublasStatus_t err) {
#if CUDART_VERSION >= 12000
    return cublasGetStatusString(err);
#else
    const char * name = "unknown error";
    switch (err) {
        case CUBLAS_STATUS_SUCCESS:          name = "CUBLAS_STATUS_SUCCESS";          break;
        case CUBLAS_STATUS_NOT_INITIALIZED:  name = "CUBLAS_STATUS_NOT_INITIALIZED";  break;
        case CUBLAS_STATUS_ALLOC_FAILED:     name = "CUBLAS_STATUS_ALLOC_FAILED";     break;
        case CUBLAS_STATUS_INVALID_VALUE:    name = "CUBLAS_STATUS_INVALID_VALUE";    break;
        case CUBLAS_STATUS_ARCH_MISMATCH:    name = "CUBLAS_STATUS_ARCH_MISMATCH";    break;
        case CUBLAS_STATUS_MAPPING_ERROR:    name = "CUBLAS_STATUS_MAPPING_ERROR";    break;
        case CUBLAS_STATUS_EXECUTION_FAILED: name = "CUBLAS_STATUS_EXECUTION_FAILED"; break;
        case CUBLAS_STATUS_INTERNAL_ERROR:   name = "CUBLAS_STATUS_INTERNAL_ERROR";   break;
        case CUBLAS_STATUS_NOT_SUPPORTED:    name = "CUBLAS_STATUS_NOT_SUPPORTED";    break;
        default:                             break; // keep "unknown error"
    }
    return name;
#endif // CUDART_VERSION >= 12000
}
|
|
|
|
|
|
|
|
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
|
|
|
|
|
|
|
#if !defined(GGML_USE_HIPBLAS)
|
|
|
|
// Returns the CUDA driver API's description of `err`, for use by CU_CHECK.
// cuGetErrorString() fails for an unrecognized code and leaves a null pointer behind; the result
// is handed to ggml_cuda_error() as the message, so a fixed fallback string is returned instead
// of a null (or, previously, possibly uninitialized) pointer.
static const char * cu_get_error_str(CUresult err) {
    const char * err_str = nullptr;
    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS || err_str == nullptr) {
        return "unknown error";
    }
    return err_str;
}
|
|
|
|
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if CUDART_VERSION >= 11100
|
|
|
|
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
|
|
|
#else
|
|
|
|
#define GGML_CUDA_ASSUME(x)
|
|
|
|
#endif // CUDART_VERSION >= 11100
|
|
|
|
|
|
|
|
#ifdef GGML_CUDA_F16
|
|
|
|
typedef half dfloat; // dequantize float
|
|
|
|
typedef half2 dfloat2;
|
|
|
|
#else
|
|
|
|
typedef float dfloat; // dequantize float
|
|
|
|
typedef float2 dfloat2;
|
|
|
|
#endif //GGML_CUDA_F16
|
|
|
|
|
2024-05-09 14:32:02 +02:00
|
|
|
#if defined(GGML_USE_HIPBLAS)
|
|
|
|
#define __CUDA_ARCH__ 1300
|
|
|
|
|
|
|
|
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
|
|
|
defined(__gfx1150__) || defined(__gfx1151__)
|
|
|
|
#define RDNA3
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
|
|
|
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
|
|
|
#define RDNA2
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef __has_builtin
|
|
|
|
#define __has_builtin(x) 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
|
|
|
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
|
|
|
// HIP-side replacement for the __vsubss4 intrinsic: a and b are each read as four packed
// int8 lanes; the result packs the lane-wise differences a - b, clamped to the int8 range.
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t lhs = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t rhs = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    // The compiler offers a saturating element-wise subtraction: use it directly.
    const int8x4_t diff = __builtin_elementwise_sub_sat(lhs, rhs);
    return reinterpret_cast<const int &>(diff);
#else
    // Manual path: subtract each lane in a wider type, then clamp back into int8.
    const int16_t lo = std::numeric_limits<int8_t>::min();
    const int16_t hi = std::numeric_limits<int8_t>::max();
    int8x4_t diff;
#pragma unroll
    for (int lane = 0; lane < 4; ++lane) {
        int16_t wide = lhs[lane] - rhs[lane];
        wide = wide > hi ? hi : wide;
        wide = wide < lo ? lo : wide;
        diff[lane] = wide;
    }
    return reinterpret_cast<int &>(diff);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}
|
|
|
|
|
|
|
|
// HIP-side replacement for the __vsub4 intrinsic: per-byte subtraction a - b on four packed
// 8-bit lanes, with wrap-around.
// This previously forwarded to the saturating __vsubss4, which gives a different result whenever a
// lane overflows the int8 range. Results are unchanged for inputs where no lane overflows.
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    // Unsigned lanes: element-wise subtraction wraps modulo 2^8 and cannot borrow across lanes.
    const uint8x4_t va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t vb = reinterpret_cast<const uint8x4_t&>(b);
    const uint8x4_t c  = va - vb;
    return reinterpret_cast<const int &>(c);
}
|
|
|
|
|
|
|
|
// HIP-side replacement for the __vcmpeq4 intrinsic: compares the four bytes of a with the
// corresponding bytes of b and returns a packed mask holding 0xff for each equal byte and
// 0x00 for each differing byte.
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t lhs = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t rhs = reinterpret_cast<const uint8x4_t&>(b);
    uint8x4_t eq_mask;
#pragma unroll
    for (int lane = 0; lane < 4; ++lane) {
        if (lhs[lane] == rhs[lane]) {
            eq_mask[lane] = 0xff;
        } else {
            eq_mask[lane] = 0x00;
        }
    }
    return reinterpret_cast<const unsigned int&>(eq_mask);
}
|
|
|
|
|
|
|
|
// HIP-side replacement for the __dp4a intrinsic: a and b are each read as four packed signed
// bytes; the result is c plus the sum of the four byte-wise products.
// The implementation is selected at compile time from the GPU target macros.
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    // Targets with the sdot4 builtin.
    return __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    // RDNA3 uses sudot4; both "true" flags mark the respective operand as signed.
    return __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    // No dot-product builtin used here: multiply byte pairs (sign-extended) two at a time
    // and fold each pair of products into c with a three-operand add.
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
    return c;
#else
    // Generic path: plain lane-by-lane multiply-accumulate.
    const int8x4_t lhs = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t rhs = reinterpret_cast<const int8x4_t&>(b);
    int dot = 0;
#pragma unroll
    for (int lane = 0; lane < 4; ++lane) {
        dot += lhs[lane] * rhs[lane];
    }
    return c + dot;
#endif
}
|
2024-05-18 10:05:17 +02:00
|
|
|
|
|
|
|
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
|
|
|
// __shfl_xor() for half2 was added in ROCm 5.6
|
|
|
|
// half2 overload of __shfl_xor for toolchains that lack one (see the enclosing version guard).
// The two halves are viewed as one 32-bit integer, exchanged with the integer overload of
// __shfl_xor, and then viewed as half2 again.
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
    union {
        half2 val;
        int   b32;
    } bits;
    bits.val = var;
    const int exchanged = __shfl_xor(bits.b32, laneMask, width);
    bits.b32 = exchanged;
    return bits.val;
}
|
|
|
|
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
2024-05-09 14:32:02 +02:00
|
|
|
#endif // defined(GGML_USE_HIPBLAS)
|
|
|
|
|
|
|
|
// FP16_AVAILABLE is 1 when compiling for HIP on AMD, or for a CUDA architecture >= CC_PASCAL, else 0.
// It is evaluated once here rather than expanded at each use: a `defined` operator produced by
// macro expansion inside #if is undefined behaviour, and the previous unparenthesized body made
// `#if !FP16_AVAILABLE` negate only its first term. `#if FP16_AVAILABLE` keeps working unchanged.
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE 1
#else
#define FP16_AVAILABLE 0
#endif
|
|
|
|
|
|
|
|
// FP16_MMA_AVAILABLE is 1 when NOT compiling for HIP on AMD and the CUDA architecture is >= CC_VOLTA,
// else 0. It is evaluated once here rather than expanded at each use: a `defined` operator produced
// by macro expansion inside #if is undefined behaviour, and the previous unparenthesized body
// could not be negated or combined safely. `#if FP16_MMA_AVAILABLE` keeps working unchanged.
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE 1
#else
#define FP16_MMA_AVAILABLE 0
#endif
|
|
|
|
|
2024-05-12 19:40:45 +02:00
|
|
|
// Runtime check on a compute capability value: true when cc is at least CC_PASCAL,
// except for cc 610, which is explicitly excluded.
static bool fast_fp16_available(const int cc) {
    if (cc == 610) {
        return false;
    }
    return cc >= CC_PASCAL;
}
|
|
|
|
|
2024-05-09 14:32:02 +02:00
|
|
|
// Runtime check on a compute capability value: true when cc is at least CC_VOLTA
// and below CC_OFFSET_AMD (the offset above which this file places the RDNA values).
static bool fp16_mma_available(const int cc) {
    const bool below_amd_range = cc < CC_OFFSET_AMD;
    const bool volta_or_newer  = cc >= CC_VOLTA;
    return below_amd_range && volta_or_newer;
}
|
|
|
|
|
2024-03-25 13:50:23 +01:00
|
|
|
// Device-side fatal error used by NO_DEVICE_CODE: prints the source location, the function name
// and the architecture value passed in, then stops execution with __trap(). Never returns.
//   file_name, line, function_name - where the error was raised
//   arch                           - architecture value to print
//   arch_list                      - list of compiled architectures; printed only on the non-HIP path
[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

    GGML_UNUSED(no_device_code); // suppress unused function warning

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    GGML_UNUSED(arch_list);
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

    __trap();
}
|
|
|
|
|
|
|
|
#ifdef __CUDA_ARCH__
|
|
|
|
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
|
|
|
|
#else
|
|
|
|
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
|
|
|
|
#endif // __CUDA_ARCH__
|
|
|
|
|
|
|
|
// Sums x across a 32-lane warp with an XOR butterfly (lane offsets 16, 8, 4, 2, 1).
// Every lane participates (full 0xffffffff mask) and every lane ends up with the total.
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const float partner = __shfl_xor_sync(0xffffffff, x, offset, 32);
        x = x + partner;
    }
    return x;
}
|
|
|
|
|
|
|
|
// Sums both components of a across a 32-lane warp with an XOR butterfly (offsets 16, 8, 4, 2, 1).
// x and y are reduced independently; every lane ends up with both totals.
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const float partner_x = __shfl_xor_sync(0xffffffff, a.x, offset, 32);
        const float partner_y = __shfl_xor_sync(0xffffffff, a.y, offset, 32);
        a.x = a.x + partner_x;
        a.y = a.y + partner_y;
    }
    return a;
}
|
|
|
|
|
|
|
|
// Sums both halves of a across a 32-lane warp with an XOR butterfly (offsets 16, 8, 4, 2, 1).
// Only compiled when FP16_AVAILABLE is true; otherwise the body is NO_DEVICE_CODE.
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if FP16_AVAILABLE
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const half2 partner = __shfl_xor_sync(0xffffffff, a, offset, 32);
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        // HIP on AMD: accumulate the low and high halves separately.
        reinterpret_cast<half&>(a.x) +=  __low2half(partner);
        reinterpret_cast<half&>(a.y) += __high2half(partner);
#else
        a = __hadd2(a, partner);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    }
    return a;
#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}
|
|
|
|
|
|
|
|
// Computes the maximum of x across a 32-lane warp with an XOR butterfly (offsets 16, 8, 4, 2, 1).
// Every lane participates (full 0xffffffff mask) and every lane ends up with the maximum.
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2) {
        const float partner = __shfl_xor_sync(0xffffffff, x, offset, 32);
        x = fmaxf(x, partner);
    }
    return x;
}
|
|
|
|
|
2024-05-01 14:46:37 +02:00
|
|
|
// Returns the larger of two half values.
// Uses __hmax on HIP/AMD builds and on CUDA runtimes at or above CUDART_HMAX; on older CUDA
// runtimes the comparison is done in float and converted back. Without FP16_AVAILABLE the body
// is NO_DEVICE_CODE.
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#if FP16_AVAILABLE

#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || CUDART_VERSION >= CUDART_HMAX
    return __hmax(a, b);
#else
    const float a_f32 = __half2float(a);
    const float b_f32 = __half2float(b);
    return __float2half(fmaxf(a_f32, b_f32));
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || CUDART_VERSION >= CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}
|
2024-05-09 14:32:02 +02:00
|
|
|
|
2024-05-01 14:46:37 +02:00
|
|
|
// Returns the element-wise maximum of two half2 values.
// Non-HIP builds: __hmax2 on CUDA runtimes at or above CUDART_HMAX; on older runtimes each half
// is compared in float and converted back.
// HIP/AMD builds: not implemented, the body is NO_DEVICE_CODE.
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif // CUDART_VERSION >= CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    // Explicit return, matching ggml_cuda_hmax: NO_DEVICE_CODE expands to nothing when
    // __CUDA_ARCH__ is undefined, and a non-void function must not fall off its end.
    return a;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
|
|
|
|
|
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 11:16:08 +02:00
|
|
|
// Max-reduction of a half2 across the 32 lanes selected by the full shuffle mask.
// Each step exchanges x with the lane whose index differs by one xor distance
// (16, 8, 4, 2, 1) and keeps the result of ggml_cuda_hmax2 (presumably an
// element-wise half2 max; defined outside this chunk).
// On AMD HIP builds, or when __CUDA_ARCH__ is below CC_PASCAL, the reduction is
// compiled out and NO_DEVICE_CODE is emitted instead.
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
    for (int lane_xor = 16; lane_xor >= 1; lane_xor >>= 1) {
        const half2 other = __shfl_xor_sync(0xffffffff, x, lane_xor, 32);
        x = ggml_cuda_hmax2(x, other);
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
|
2024-03-25 13:50:23 +01:00
|
|
|
|
2024-05-01 14:46:37 +02:00
|
|
|
#if CUDART_VERSION < CUDART_HMASK
// Replacement for __hgt2_mask, compiled only when CUDART_VERSION < CUDART_HMASK
// (presumably because older toolkits do not provide the intrinsic — confirm).
// Compares the two halves of a and b independently, in float precision, and
// returns a 32-bit mask: the low 16 bits are all ones if low(a) > low(b), the
// high 16 bits are all ones if high(a) > high(b); otherwise those bits are zero.
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const bool low_gt  = float( __low2half(a)) > float( __low2half(b));
    const bool high_gt = float(__high2half(a)) > float(__high2half(b));

    const uint32_t mask_low  = low_gt  ? 0x0000FFFF : 0;
    const uint32_t mask_high = high_gt ? 0xFFFF0000 : 0;

    return mask_low | mask_high;
}
#endif // CUDART_VERSION < CUDART_HMASK
|
2024-03-25 13:50:23 +01:00
|
|
|
|
|
|
|
// TODO: move to ggml-common.h
|
|
|
|
// 16-entry table of signed 8-bit values, declared as a device-side constant.
// NOTE(review): the name suggests these are the IQ4_NL codebook values indexed by a
// 4-bit quant — confirm against the kernels that read it.
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
|
|
|
|
2024-04-09 10:16:13 +02:00
|
|
|
// Function-pointer type for dequantization routines: receives an untyped pointer vx,
// a 64-bit index ib and an int iqs, and writes its result through the dfloat2 reference v.
// NOTE(review): ib / iqs presumably select the quant block and the position inside it —
// confirm with the implementations.
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
2024-03-25 13:50:23 +01:00
|
|
|
|
2024-05-18 12:36:25 +02:00
|
|
|
// Computes the ALiBi slope for head index h.
//   max_bias    - when <= 0 the function returns 1.0f and the other arguments are ignored
//   h           - head index
//   n_head_log2 - threshold separating the two slope series
//   m0, m1      - bases of the first (h < n_head_log2) and second series
// For h < n_head_log2 the result is m0^(h + 1); otherwise it is
// m1^(2*(h - n_head_log2) + 1).
static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }

    float base;
    int   exph;
    if (h < n_head_log2) {
        base = m0;
        exph = h + 1;
    } else {
        base = m1;
        exph = 2*(h - n_head_log2) + 1;
    }

    return powf(base, exph);
}
|
2024-03-25 13:50:23 +01:00
|
|
|
|
|
|
|
//////////////////////
|
|
|
|
|
|
|
|
// Description of the CUDA devices known to the backend: a device count, a fixed-size
// table of per-device properties and a default tensor split with one float per slot.
struct ggml_cuda_device_info {
    // NOTE(review): presumably the number of valid entries in `devices` — confirm.
    int device_count;

    // Properties recorded for a single device.
    struct cuda_device_info {
        int    cc;              // compute capability
        int    nsm;             // number of streaming multiprocessors
        size_t smpb;            // max. shared memory per block
        bool   vmm;             // virtual memory support
        size_t vmm_granularity; // granularity of virtual memory
        size_t total_vram;      // NOTE(review): presumably total device memory in bytes — confirm
    };

    // One slot per possible device, value-initialized.
    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    // One float per possible device, value-initialized to zero.
    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};
|
|
|
|
|
|
|
|
// Declaration only; the definition is not in this header chunk.
// Returns a const reference to a ggml_cuda_device_info.
// NOTE(review): presumably a process-wide, lazily initialized instance — confirm at the definition.
const ggml_cuda_device_info & ggml_cuda_info();
|
|
|
|
|
|
|
|
// Declaration only; the definition is not in this header chunk.
// Takes a device index. ggml_backend_cuda_context calls it right before creating a
// per-device stream or cuBLAS handle.
// NOTE(review): presumably makes `device` the current CUDA device — confirm at the definition.
void ggml_cuda_set_device(int device);
|
|
|
|
// Declaration only; the definition is not in this header chunk.
// NOTE(review): presumably returns the index of the currently active CUDA device — confirm.
int ggml_cuda_get_device();
|
|
|
|
|
|
|
|
// Abstract memory-pool interface. Concrete pools implement alloc/free; the virtual
// destructor allows deletion through a base pointer.
struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    // Requests `size` bytes and reports a size through *actual_size.
    // NOTE(review): presumably *actual_size may exceed `size` and is the value to
    // hand back to free() — ggml_cuda_pool_alloc uses it that way.
    virtual void * alloc(size_t size, size_t * actual_size) = 0;

    // Returns `ptr`, previously obtained from alloc(), together with its size.
    virtual void   free(void * ptr, size_t size) = 0;
};
|
|
|
|
|
|
|
|
// Scoped owner of a single allocation taken from a ggml_cuda_pool.
// The buffer is requested with alloc() (sizes are in elements of T, not bytes) and is
// handed back to the pool when this object is destroyed. Copying and moving are disabled.
template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr; // pool the buffer comes from and is returned to
    T * ptr = nullptr;               // current buffer, nullptr while nothing is allocated
    size_t actual_size = 0;          // size reported by the pool; passed back on free

    ggml_cuda_pool_alloc() = default;

    // Binds to a pool without allocating.
    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {}

    // Binds to a pool and immediately allocates `size` elements.
    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    // Non-copyable and non-movable.
    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;

    // Returns the buffer to the pool, if one was allocated.
    ~ggml_cuda_pool_alloc() {
        if (ptr == nullptr) {
            return;
        }
        pool->free(ptr, actual_size);
    }

    // size is in number of elements
    // Requires a bound pool and no existing allocation (both asserted).
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        void * raw = pool->alloc(size * sizeof(T), &actual_size);
        ptr = static_cast<T *>(raw);
        return ptr;
    }

    // Rebinds to `pool`, then allocates `size` elements from it.
    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    // Current buffer pointer (nullptr if nothing has been allocated).
    T * get() {
        return ptr;
    }
};
|
|
|
|
|
|
|
|
|
|
|
|
// backend interface
|
|
|
|
|
|
|
|
// Per-tensor GPU bookkeeping: one data pointer per device and one event per
// (device, stream) pair.
struct ggml_tensor_extra_gpu {
    // 1 pointer for each device for split tensors
    void * data_device[GGML_CUDA_MAX_DEVICES];

    // events for synchronizing multiple GPUs
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
};
|
|
|
|
|
2024-05-08 22:55:49 +02:00
|
|
|
|
|
|
|
// USE_CUDA_GRAPH is defined only when the build opted in via GGML_CUDA_USE_GRAPHS
// and the CUDA runtime version is at least 12000.
#if defined(GGML_CUDA_USE_GRAPHS) && (CUDART_VERSION >= 12000)
#define USE_CUDA_GRAPH
#endif // defined(GGML_CUDA_USE_GRAPHS) && (CUDART_VERSION >= 12000)
|
|
|
|
|
|
|
|
// Record of one graph node: an address, its op, its ne/nb arrays and one address per
// source slot.
// NOTE(review): presumably compared between evaluations to detect whether a captured
// CUDA graph must be updated — confirm at the use sites.
struct ggml_graph_node_properties {
    void *  node_address;               // address recorded for the node
    ggml_op node_op;                    // operation of the node
    int64_t ne[GGML_MAX_DIMS];          // NOTE(review): presumably elements per dimension — confirm
    size_t  nb[GGML_MAX_DIMS];          // NOTE(review): presumably stride in bytes per dimension — confirm
    void *  src_address[GGML_MAX_SRC];  // one address per source slot
};
|
|
|
|
|
|
|
|
// State for executing a ggml graph through a CUDA graph. The struct is empty unless
// USE_CUDA_GRAPH is defined. When it is, the struct holds the cudaGraph_t /
// cudaGraphExec_t handles and destroys them in its destructor.
// NOTE(review): the handles are raw and the type is implicitly copyable; copying an
// instance would destroy the same handles twice — confirm it is only held by pointer.
struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;

    // Flags recording why graph usage was turned off.
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;

    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    std::vector<char **> updated_kernel_arg;

    // Destroys the executable instance first, then the graph; null handles are skipped.
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
#endif // USE_CUDA_GRAPH
};
|
|
|
|
|
2024-03-25 13:50:23 +01:00
|
|
|
struct ggml_backend_cuda_context {
|
|
|
|
int device;
|
|
|
|
std::string name;
|
|
|
|
cudaEvent_t copy_event = nullptr;
|
|
|
|
|
|
|
|
cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
|
|
|
|
cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
|
|
|
|
2024-05-08 22:55:49 +02:00
|
|
|
std::unique_ptr<ggml_cuda_graph> cuda_graph;
|
|
|
|
|
2024-03-25 13:50:23 +01:00
|
|
|
explicit ggml_backend_cuda_context(int device) :
|
|
|
|
device(device),
|
|
|
|
name(GGML_CUDA_NAME + std::to_string(device)) {
|
|
|
|
}
|
|
|
|
|
|
|
|
~ggml_backend_cuda_context() {
|
|
|
|
if (copy_event != nullptr) {
|
|
|
|
CUDA_CHECK(cudaEventDestroy(copy_event));
|
|
|
|
}
|
|
|
|
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
|
|
|
|
for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
|
|
|
|
if (streams[i][j] != nullptr) {
|
|
|
|
CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (cublas_handles[i] != nullptr) {
|
|
|
|
CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cudaStream_t stream(int device, int stream) {
|
|
|
|
if (streams[device][stream] == nullptr) {
|
|
|
|
ggml_cuda_set_device(device);
|
|
|
|
CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
|
|
|
|
}
|
|
|
|
return streams[device][stream];
|
|
|
|
}
|
|
|
|
|
|
|
|
cudaStream_t stream() {
|
|
|
|
return stream(device, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
cublasHandle_t cublas_handle(int device) {
|
|
|
|
if (cublas_handles[device] == nullptr) {
|
|
|
|
ggml_cuda_set_device(device);
|
|
|
|
CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
|
|
|
|
CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
|
|
|
|
}
|
|
|
|
return cublas_handles[device];
|
|
|
|
}
|
|
|
|
|
|
|
|
cublasHandle_t cublas_handle() {
|
|
|
|
return cublas_handle(device);
|
|
|
|
}
|
|
|
|
|
|
|
|
// pool
|
|
|
|
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
|
|
|
|
|
|
|
|
static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
|
|
|
|
|
|
|
|
ggml_cuda_pool & pool(int device) {
|
|
|
|
if (pools[device] == nullptr) {
|
|
|
|
pools[device] = new_pool_for_device(device);
|
|
|
|
}
|
|
|
|
return *pools[device];
|
|
|
|
}
|
|
|
|
|
|
|
|
ggml_cuda_pool & pool() {
|
|
|
|
return pool(device);
|
|
|
|
}
|
|
|
|
};
|