mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-14 14:28:58 +01:00
cuda : organize vendor-specific headers into vendors directory (#8746)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
This commit is contained in:
parent
0832de7236
commit
439b3fc75a
@ -27,255 +27,11 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
#include <hip/hip_runtime.h>
|
#include "vendors/hip.h"
|
||||||
#include <hipblas/hipblas.h>
|
|
||||||
#include <hip/hip_fp16.h>
|
|
||||||
#ifdef __HIP_PLATFORM_AMD__
|
|
||||||
// for rocblas_initialize()
|
|
||||||
#include "rocblas/rocblas.h"
|
|
||||||
#endif // __HIP_PLATFORM_AMD__
|
|
||||||
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
|
||||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
|
||||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
|
||||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_OP_N HIPBLAS_OP_N
|
|
||||||
#define CUBLAS_OP_T HIPBLAS_OP_T
|
|
||||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
||||||
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
|
||||||
#define CUDA_R_16F HIPBLAS_R_16F
|
|
||||||
#define CUDA_R_32F HIPBLAS_R_32F
|
|
||||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
|
||||||
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
|
||||||
#define cublasCreate hipblasCreate
|
|
||||||
#define cublasDestroy hipblasDestroy
|
|
||||||
#define cublasGemmEx hipblasGemmEx
|
|
||||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
|
||||||
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
|
||||||
#define cublasHandle_t hipblasHandle_t
|
|
||||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
|
||||||
#define cublasSetStream hipblasSetStream
|
|
||||||
#define cublasSgemm hipblasSgemm
|
|
||||||
#define cublasStatus_t hipblasStatus_t
|
|
||||||
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
|
||||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
|
||||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
|
||||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
|
||||||
#define cudaDeviceProp hipDeviceProp_t
|
|
||||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
|
||||||
#define cudaError_t hipError_t
|
|
||||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
|
||||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
|
||||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
|
||||||
#define cudaEventDisableTiming hipEventDisableTiming
|
|
||||||
#define cudaEventRecord hipEventRecord
|
|
||||||
#define cudaEventSynchronize hipEventSynchronize
|
|
||||||
#define cudaEvent_t hipEvent_t
|
|
||||||
#define cudaEventDestroy hipEventDestroy
|
|
||||||
#define cudaFree hipFree
|
|
||||||
#define cudaFreeHost hipHostFree
|
|
||||||
#define cudaGetDevice hipGetDevice
|
|
||||||
#define cudaGetDeviceCount hipGetDeviceCount
|
|
||||||
#define cudaGetDeviceProperties hipGetDeviceProperties
|
|
||||||
#define cudaGetErrorString hipGetErrorString
|
|
||||||
#define cudaGetLastError hipGetLastError
|
|
||||||
#define cudaHostRegister hipHostRegister
|
|
||||||
#define cudaHostRegisterPortable hipHostRegisterPortable
|
|
||||||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
|
||||||
#define cudaHostUnregister hipHostUnregister
|
|
||||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
|
||||||
#define cudaMalloc hipMalloc
|
|
||||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
|
||||||
#define cudaMemcpy hipMemcpy
|
|
||||||
#define cudaMemcpyAsync hipMemcpyAsync
|
|
||||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
|
||||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
|
||||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
|
||||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
|
||||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
|
||||||
#define cudaMemcpyKind hipMemcpyKind
|
|
||||||
#define cudaMemset hipMemset
|
|
||||||
#define cudaMemsetAsync hipMemsetAsync
|
|
||||||
#define cudaMemGetInfo hipMemGetInfo
|
|
||||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
|
||||||
#define cudaSetDevice hipSetDevice
|
|
||||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
|
||||||
#define cudaStreamDestroy hipStreamDestroy
|
|
||||||
#define cudaStreamFireAndForget hipStreamFireAndForget
|
|
||||||
#define cudaStreamNonBlocking hipStreamNonBlocking
|
|
||||||
#define cudaStreamPerThread hipStreamPerThread
|
|
||||||
#define cudaStreamSynchronize hipStreamSynchronize
|
|
||||||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
|
||||||
#define cudaStream_t hipStream_t
|
|
||||||
#define cudaSuccess hipSuccess
|
|
||||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
|
||||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
||||||
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
|
||||||
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
|
||||||
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
|
||||||
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
|
||||||
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
|
||||||
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
|
||||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
|
||||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
|
||||||
#elif defined(GGML_USE_MUSA)
|
#elif defined(GGML_USE_MUSA)
|
||||||
#include <musa_runtime.h>
|
#include "vendors/musa.h"
|
||||||
#include <musa.h>
|
|
||||||
#include <mublas.h>
|
|
||||||
#include <musa_fp16.h>
|
|
||||||
// XXX: Keep the following order the same as hipBLAS
|
|
||||||
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
|
|
||||||
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
|
|
||||||
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
|
|
||||||
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
|
||||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
|
||||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
|
||||||
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
|
|
||||||
#define CUDA_R_16F MUSA_R_16F
|
|
||||||
#define CUDA_R_32F MUSA_R_32F
|
|
||||||
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
|
||||||
// #define cublasComputeType_t mublasComputeType_t
|
|
||||||
#define cublasCreate mublasCreate
|
|
||||||
#define cublasDestroy mublasDestroy
|
|
||||||
#define cublasGemmEx mublasGemmEx
|
|
||||||
#define cublasGemmBatchedEx mublasGemmBatchedEx
|
|
||||||
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
|
|
||||||
#define cublasHandle_t mublasHandle_t
|
|
||||||
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
|
||||||
#define cublasSetMathMode mublasSetMathMode
|
|
||||||
#define cublasSetStream mublasSetStream
|
|
||||||
#define cublasSgemm mublasSgemm
|
|
||||||
#define cublasStatus_t mublasStatus_t
|
|
||||||
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
|
|
||||||
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
|
||||||
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
|
||||||
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
|
|
||||||
#define cudaDeviceProp musaDeviceProp
|
|
||||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
|
||||||
#define cudaError_t musaError_t
|
|
||||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
|
||||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
|
||||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
|
||||||
#define cudaEventDisableTiming musaEventDisableTiming
|
|
||||||
#define cudaEventRecord musaEventRecord
|
|
||||||
#define cudaEventSynchronize musaEventSynchronize
|
|
||||||
#define cudaEvent_t musaEvent_t
|
|
||||||
#define cudaEventDestroy musaEventDestroy
|
|
||||||
#define cudaFree musaFree
|
|
||||||
#define cudaFreeHost musaFreeHost
|
|
||||||
#define cudaGetDevice musaGetDevice
|
|
||||||
#define cudaGetDeviceCount musaGetDeviceCount
|
|
||||||
#define cudaGetDeviceProperties musaGetDeviceProperties
|
|
||||||
#define cudaGetErrorString musaGetErrorString
|
|
||||||
#define cudaGetLastError musaGetLastError
|
|
||||||
#define cudaHostRegister musaHostRegister
|
|
||||||
#define cudaHostRegisterPortable musaHostRegisterPortable
|
|
||||||
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
|
|
||||||
#define cudaHostUnregister musaHostUnregister
|
|
||||||
#define cudaLaunchHostFunc musaLaunchHostFunc
|
|
||||||
#define cudaMalloc musaMalloc
|
|
||||||
#define cudaMallocHost musaMallocHost
|
|
||||||
#define cudaMemcpy musaMemcpy
|
|
||||||
#define cudaMemcpyAsync musaMemcpyAsync
|
|
||||||
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
|
||||||
#define cudaMemcpy2DAsync musaMemcpy2DAsync
|
|
||||||
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
|
|
||||||
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
|
|
||||||
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
|
|
||||||
#define cudaMemcpyKind musaMemcpyKind
|
|
||||||
#define cudaMemset musaMemset
|
|
||||||
#define cudaMemsetAsync musaMemsetAsync
|
|
||||||
#define cudaMemGetInfo musaMemGetInfo
|
|
||||||
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
|
|
||||||
#define cudaSetDevice musaSetDevice
|
|
||||||
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
|
|
||||||
#define cudaStreamDestroy musaStreamDestroy
|
|
||||||
#define cudaStreamFireAndForget musaStreamFireAndForget
|
|
||||||
#define cudaStreamNonBlocking musaStreamNonBlocking
|
|
||||||
#define cudaStreamPerThread musaStreamPerThread
|
|
||||||
#define cudaStreamSynchronize musaStreamSynchronize
|
|
||||||
#define cudaStreamWaitEvent musaStreamWaitEvent
|
|
||||||
#define cudaStream_t musaStream_t
|
|
||||||
#define cudaSuccess musaSuccess
|
|
||||||
|
|
||||||
// XXX: Other CUDA => MUSA mapping
|
|
||||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
|
|
||||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
|
|
||||||
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
|
|
||||||
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
|
|
||||||
#define CUdevice MUdevice
|
|
||||||
#define CUdeviceptr MUdeviceptr
|
|
||||||
#define CUmemAccessDesc MUmemAccessDesc
|
|
||||||
#define CUmemAllocationProp MUmemAllocationProp
|
|
||||||
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
|
|
||||||
#define cuDeviceGet muDeviceGet
|
|
||||||
#define cuDeviceGetAttribute muDeviceGetAttribute
|
|
||||||
#define cuMemAddressFree muMemAddressFree
|
|
||||||
#define cuMemAddressReserve muMemAddressReserve
|
|
||||||
#define cuMemCreate muMemCreate
|
|
||||||
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
|
|
||||||
#define cuMemMap muMemMap
|
|
||||||
#define cuMemRelease muMemRelease
|
|
||||||
#define cuMemSetAccess muMemSetAccess
|
|
||||||
#define cuMemUnmap muMemUnmap
|
|
||||||
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
|
|
||||||
#define cudaFuncSetAttribute musaFuncSetAttribute
|
|
||||||
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
|
|
||||||
#define make_cudaExtent make_musaExtent
|
|
||||||
#define make_cudaPitchedPtr make_musaPitchedPtr
|
|
||||||
|
|
||||||
// XXX: USE_CUDA_GRAPH
|
|
||||||
#define CUDA_SUCCESS MUSA_SUCCESS
|
|
||||||
#define CUresult MUresult
|
|
||||||
#define cuGetErrorString muGetErrorString
|
|
||||||
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
|
|
||||||
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
|
|
||||||
#define cudaGraphDestroy musaGraphDestroy
|
|
||||||
#define cudaGraphExecDestroy musaGraphExecDestroy
|
|
||||||
#define cudaGraphExec_t musaGraphExec_t
|
|
||||||
#define cudaGraphExecUpdate musaGraphExecUpdate
|
|
||||||
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
|
|
||||||
#define cudaGraphGetNodes musaGraphGetNodes
|
|
||||||
#define cudaGraphInstantiate musaGraphInstantiate
|
|
||||||
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
|
|
||||||
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
|
|
||||||
#define cudaGraphLaunch musaGraphLaunch
|
|
||||||
#define cudaGraphNodeGetType musaGraphNodeGetType
|
|
||||||
#define cudaGraphNode_t musaGraphNode_t
|
|
||||||
#define cudaGraphNodeType musaGraphNodeType
|
|
||||||
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
|
|
||||||
#define cudaGraph_t musaGraph_t
|
|
||||||
#define cudaKernelNodeParams musaKernelNodeParams
|
|
||||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
|
||||||
#define cudaStreamEndCapture musaStreamEndCapture
|
|
||||||
|
|
||||||
// XXX: cuBLAS => muBLAS mapping
|
|
||||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
|
||||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
|
||||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
|
||||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
|
||||||
#define cublasComputeType_t cudaDataType_t
|
|
||||||
|
|
||||||
// XXX: Clang builtins mapping
|
|
||||||
#define __vsub4 __vsub4_musa
|
|
||||||
#define __vcmpeq4 __vcmpeq4_musa
|
|
||||||
#define __vcmpne4 __vcmpne4_musa
|
|
||||||
#else
|
#else
|
||||||
#include <cuda_runtime.h>
|
#include "vendors/cuda.h"
|
||||||
#include <cuda.h>
|
|
||||||
#include <cublas_v2.h>
|
|
||||||
#include <cuda_fp16.h>
|
|
||||||
|
|
||||||
#if CUDART_VERSION < 11020
|
|
||||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
|
||||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
|
||||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
|
||||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
|
||||||
#define cublasComputeType_t cudaDataType_t
|
|
||||||
#endif // CUDART_VERSION < 11020
|
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||||
@ -318,11 +74,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
|||||||
|
|
||||||
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
|
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
|
||||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||||
#ifndef GGML_USE_MUSA
|
|
||||||
return cublasGetStatusString(err);
|
return cublasGetStatusString(err);
|
||||||
#else
|
|
||||||
return mublasStatus_to_string(err);
|
|
||||||
#endif // GGML_USE_MUSA
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||||
@ -364,129 +116,7 @@ typedef half2 dfloat2;
|
|||||||
#else
|
#else
|
||||||
typedef float dfloat; // dequantize float
|
typedef float dfloat; // dequantize float
|
||||||
typedef float2 dfloat2;
|
typedef float2 dfloat2;
|
||||||
#endif //GGML_CUDA_F16
|
#endif // GGML_CUDA_F16
|
||||||
|
|
||||||
#if defined(GGML_USE_MUSA)
|
|
||||||
#ifndef __has_builtin
|
|
||||||
#define __has_builtin(x) 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
|
||||||
|
|
||||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
|
||||||
return __vsubss4(a, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
#endif // defined(GGML_USE_MUSA)
|
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
|
||||||
#define __CUDA_ARCH__ 1300
|
|
||||||
|
|
||||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
|
||||||
defined(__gfx1150__) || defined(__gfx1151__)
|
|
||||||
#define RDNA3
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
|
||||||
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
|
||||||
#define RDNA2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__gfx1010__) || defined(__gfx1012__)
|
|
||||||
#define RDNA1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef __has_builtin
|
|
||||||
#define __has_builtin(x) 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
|
||||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
|
||||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
||||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
|
||||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
|
||||||
#if __has_builtin(__builtin_elementwise_sub_sat)
|
|
||||||
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
|
||||||
return reinterpret_cast<const int &>(c);
|
|
||||||
#else
|
|
||||||
int8x4_t c;
|
|
||||||
int16_t tmp;
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; i++) {
|
|
||||||
tmp = va[i] - vb[i];
|
|
||||||
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
|
|
||||||
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
|
||||||
c[i] = tmp;
|
|
||||||
}
|
|
||||||
return reinterpret_cast<int &>(c);
|
|
||||||
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
|
||||||
return __vsubss4(a, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
|
||||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
|
||||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
|
||||||
typedef union half2_b32 {
|
|
||||||
half2 val;
|
|
||||||
int b32;
|
|
||||||
} half2_b32_t;
|
|
||||||
half2_b32_t tmp;
|
|
||||||
tmp.val = var;
|
|
||||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
|
||||||
return tmp.val;
|
|
||||||
}
|
|
||||||
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
|
||||||
|
|
||||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
#define FP16_AVAILABLE
|
#define FP16_AVAILABLE
|
||||||
|
14
ggml/src/ggml-cuda/vendors/cuda.h
vendored
Normal file
14
ggml/src/ggml-cuda/vendors/cuda.h
vendored
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
|
||||||
|
#if CUDART_VERSION < 11020
|
||||||
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||||
|
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||||
|
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||||
|
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||||
|
#define cublasComputeType_t cudaDataType_t
|
||||||
|
#endif // CUDART_VERSION < 11020
|
177
ggml/src/ggml-cuda/vendors/hip.h
vendored
Normal file
177
ggml/src/ggml-cuda/vendors/hip.h
vendored
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <hip/hip_runtime.h>
|
||||||
|
#include <hipblas/hipblas.h>
|
||||||
|
#include <hip/hip_fp16.h>
|
||||||
|
#ifdef __HIP_PLATFORM_AMD__
|
||||||
|
// for rocblas_initialize()
|
||||||
|
#include "rocblas/rocblas.h"
|
||||||
|
#endif // __HIP_PLATFORM_AMD__
|
||||||
|
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
||||||
|
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||||
|
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||||
|
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||||
|
#define CUBLAS_OP_T HIPBLAS_OP_T
|
||||||
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||||
|
#define CUDA_R_16F HIPBLAS_R_16F
|
||||||
|
#define CUDA_R_32F HIPBLAS_R_32F
|
||||||
|
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||||
|
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
||||||
|
#define cublasCreate hipblasCreate
|
||||||
|
#define cublasDestroy hipblasDestroy
|
||||||
|
#define cublasGemmEx hipblasGemmEx
|
||||||
|
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||||
|
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
||||||
|
#define cublasHandle_t hipblasHandle_t
|
||||||
|
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||||
|
#define cublasSetStream hipblasSetStream
|
||||||
|
#define cublasSgemm hipblasSgemm
|
||||||
|
#define cublasStatus_t hipblasStatus_t
|
||||||
|
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
||||||
|
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||||
|
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||||
|
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||||
|
#define cudaDeviceProp hipDeviceProp_t
|
||||||
|
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||||
|
#define cudaError_t hipError_t
|
||||||
|
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||||
|
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||||
|
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||||
|
#define cudaEventDisableTiming hipEventDisableTiming
|
||||||
|
#define cudaEventRecord hipEventRecord
|
||||||
|
#define cudaEventSynchronize hipEventSynchronize
|
||||||
|
#define cudaEvent_t hipEvent_t
|
||||||
|
#define cudaEventDestroy hipEventDestroy
|
||||||
|
#define cudaFree hipFree
|
||||||
|
#define cudaFreeHost hipHostFree
|
||||||
|
#define cudaGetDevice hipGetDevice
|
||||||
|
#define cudaGetDeviceCount hipGetDeviceCount
|
||||||
|
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||||
|
#define cudaGetErrorString hipGetErrorString
|
||||||
|
#define cudaGetLastError hipGetLastError
|
||||||
|
#define cudaHostRegister hipHostRegister
|
||||||
|
#define cudaHostRegisterPortable hipHostRegisterPortable
|
||||||
|
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||||
|
#define cudaHostUnregister hipHostUnregister
|
||||||
|
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||||
|
#define cudaMalloc hipMalloc
|
||||||
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||||
|
#define cudaMemcpy hipMemcpy
|
||||||
|
#define cudaMemcpyAsync hipMemcpyAsync
|
||||||
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||||
|
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||||
|
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||||
|
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||||
|
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||||
|
#define cudaMemcpyKind hipMemcpyKind
|
||||||
|
#define cudaMemset hipMemset
|
||||||
|
#define cudaMemsetAsync hipMemsetAsync
|
||||||
|
#define cudaMemGetInfo hipMemGetInfo
|
||||||
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||||
|
#define cudaSetDevice hipSetDevice
|
||||||
|
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||||
|
#define cudaStreamDestroy hipStreamDestroy
|
||||||
|
#define cudaStreamFireAndForget hipStreamFireAndForget
|
||||||
|
#define cudaStreamNonBlocking hipStreamNonBlocking
|
||||||
|
#define cudaStreamPerThread hipStreamPerThread
|
||||||
|
#define cudaStreamSynchronize hipStreamSynchronize
|
||||||
|
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||||
|
#define cudaStream_t hipStream_t
|
||||||
|
#define cudaSuccess hipSuccess
|
||||||
|
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||||
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||||
|
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||||
|
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
||||||
|
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
||||||
|
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
||||||
|
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
||||||
|
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||||
|
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||||
|
|
||||||
|
#define __CUDA_ARCH__ 1300
|
||||||
|
|
||||||
|
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
||||||
|
defined(__gfx1150__) || defined(__gfx1151__)
|
||||||
|
#define RDNA3
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
||||||
|
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
||||||
|
#define RDNA2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__gfx1010__) || defined(__gfx1012__)
|
||||||
|
#define RDNA1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __has_builtin
|
||||||
|
#define __has_builtin(x) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||||
|
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||||
|
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||||
|
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||||
|
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||||
|
#if __has_builtin(__builtin_elementwise_sub_sat)
|
||||||
|
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
||||||
|
return reinterpret_cast<const int &>(c);
|
||||||
|
#else
|
||||||
|
int8x4_t c;
|
||||||
|
int16_t tmp;
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
tmp = va[i] - vb[i];
|
||||||
|
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
|
||||||
|
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
||||||
|
c[i] = tmp;
|
||||||
|
}
|
||||||
|
return reinterpret_cast<int &>(c);
|
||||||
|
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
||||||
|
return __vsubss4(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||||
|
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||||
|
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||||
|
typedef union half2_b32 {
|
||||||
|
half2 val;
|
||||||
|
int b32;
|
||||||
|
} half2_b32_t;
|
||||||
|
half2_b32_t tmp;
|
||||||
|
tmp.val = var;
|
||||||
|
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||||
|
return tmp.val;
|
||||||
|
}
|
||||||
|
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
171
ggml/src/ggml-cuda/vendors/musa.h
vendored
Normal file
171
ggml/src/ggml-cuda/vendors/musa.h
vendored
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <musa_runtime.h>
|
||||||
|
#include <musa.h>
|
||||||
|
#include <mublas.h>
|
||||||
|
#include <musa_fp16.h>
|
||||||
|
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||||
|
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||||
|
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
|
||||||
|
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||||
|
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||||
|
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
||||||
|
#define CUDA_R_16F MUSA_R_16F
|
||||||
|
#define CUDA_R_32F MUSA_R_32F
|
||||||
|
#define cublasComputeType_t cudaDataType_t
|
||||||
|
#define cublasCreate mublasCreate
|
||||||
|
#define cublasDestroy mublasDestroy
|
||||||
|
#define cublasGemmEx mublasGemmEx
|
||||||
|
#define cublasGemmBatchedEx mublasGemmBatchedEx
|
||||||
|
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
|
||||||
|
#define cublasHandle_t mublasHandle_t
|
||||||
|
#define cublasSetMathMode mublasSetMathMode
|
||||||
|
#define cublasSetStream mublasSetStream
|
||||||
|
#define cublasSgemm mublasSgemm
|
||||||
|
#define cublasStatus_t mublasStatus_t
|
||||||
|
#define cublasGetStatusString mublasStatus_to_string
|
||||||
|
#define cudaDataType_t musaDataType_t
|
||||||
|
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
||||||
|
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
||||||
|
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
|
||||||
|
#define cudaDeviceProp musaDeviceProp
|
||||||
|
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||||
|
#define cudaError_t musaError_t
|
||||||
|
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||||
|
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||||
|
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||||
|
#define cudaEventDisableTiming musaEventDisableTiming
|
||||||
|
#define cudaEventRecord musaEventRecord
|
||||||
|
#define cudaEventSynchronize musaEventSynchronize
|
||||||
|
#define cudaEvent_t musaEvent_t
|
||||||
|
#define cudaEventDestroy musaEventDestroy
|
||||||
|
#define cudaFree musaFree
|
||||||
|
#define cudaFreeHost musaFreeHost
|
||||||
|
#define cudaGetDevice musaGetDevice
|
||||||
|
#define cudaGetDeviceCount musaGetDeviceCount
|
||||||
|
#define cudaGetDeviceProperties musaGetDeviceProperties
|
||||||
|
#define cudaGetErrorString musaGetErrorString
|
||||||
|
#define cudaGetLastError musaGetLastError
|
||||||
|
#define cudaHostRegister musaHostRegister
|
||||||
|
#define cudaHostRegisterPortable musaHostRegisterPortable
|
||||||
|
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
|
||||||
|
#define cudaHostUnregister musaHostUnregister
|
||||||
|
#define cudaLaunchHostFunc musaLaunchHostFunc
|
||||||
|
#define cudaMalloc musaMalloc
|
||||||
|
#define cudaMallocHost musaMallocHost
|
||||||
|
#define cudaMemcpy musaMemcpy
|
||||||
|
#define cudaMemcpyAsync musaMemcpyAsync
|
||||||
|
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
||||||
|
#define cudaMemcpy2DAsync musaMemcpy2DAsync
|
||||||
|
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
|
||||||
|
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
|
||||||
|
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
|
||||||
|
#define cudaMemcpyKind musaMemcpyKind
|
||||||
|
#define cudaMemset musaMemset
|
||||||
|
#define cudaMemsetAsync musaMemsetAsync
|
||||||
|
#define cudaMemGetInfo musaMemGetInfo
|
||||||
|
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
|
||||||
|
#define cudaSetDevice musaSetDevice
|
||||||
|
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
|
||||||
|
#define cudaStreamDestroy musaStreamDestroy
|
||||||
|
#define cudaStreamFireAndForget musaStreamFireAndForget
|
||||||
|
#define cudaStreamNonBlocking musaStreamNonBlocking
|
||||||
|
#define cudaStreamPerThread musaStreamPerThread
|
||||||
|
#define cudaStreamSynchronize musaStreamSynchronize
|
||||||
|
#define cudaStreamWaitEvent musaStreamWaitEvent
|
||||||
|
#define cudaStream_t musaStream_t
|
||||||
|
#define cudaSuccess musaSuccess
|
||||||
|
|
||||||
|
// Additional mappings for MUSA virtual memory pool
|
||||||
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||||
|
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
|
||||||
|
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
|
||||||
|
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
|
||||||
|
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
|
||||||
|
#define CUdevice MUdevice
|
||||||
|
#define CUdeviceptr MUdeviceptr
|
||||||
|
#define CUmemAccessDesc MUmemAccessDesc
|
||||||
|
#define CUmemAllocationProp MUmemAllocationProp
|
||||||
|
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
|
||||||
|
#define cuDeviceGet muDeviceGet
|
||||||
|
#define cuDeviceGetAttribute muDeviceGetAttribute
|
||||||
|
#define cuMemAddressFree muMemAddressFree
|
||||||
|
#define cuMemAddressReserve muMemAddressReserve
|
||||||
|
#define cuMemCreate muMemCreate
|
||||||
|
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
|
||||||
|
#define cuMemMap muMemMap
|
||||||
|
#define cuMemRelease muMemRelease
|
||||||
|
#define cuMemSetAccess muMemSetAccess
|
||||||
|
#define cuMemUnmap muMemUnmap
|
||||||
|
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
|
||||||
|
#define cudaFuncSetAttribute musaFuncSetAttribute
|
||||||
|
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
|
||||||
|
#define make_cudaExtent make_musaExtent
|
||||||
|
#define make_cudaPitchedPtr make_musaPitchedPtr
|
||||||
|
|
||||||
|
// Additional mappings for MUSA graphs
|
||||||
|
#define CUDA_SUCCESS MUSA_SUCCESS
|
||||||
|
#define CUresult MUresult
|
||||||
|
#define cuGetErrorString muGetErrorString
|
||||||
|
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
|
||||||
|
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
|
||||||
|
#define cudaGraphDestroy musaGraphDestroy
|
||||||
|
#define cudaGraphExecDestroy musaGraphExecDestroy
|
||||||
|
#define cudaGraphExec_t musaGraphExec_t
|
||||||
|
#define cudaGraphExecUpdate musaGraphExecUpdate
|
||||||
|
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
|
||||||
|
#define cudaGraphGetNodes musaGraphGetNodes
|
||||||
|
#define cudaGraphInstantiate musaGraphInstantiate
|
||||||
|
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
|
||||||
|
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
|
||||||
|
#define cudaGraphLaunch musaGraphLaunch
|
||||||
|
#define cudaGraphNodeGetType musaGraphNodeGetType
|
||||||
|
#define cudaGraphNode_t musaGraphNode_t
|
||||||
|
#define cudaGraphNodeType musaGraphNodeType
|
||||||
|
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
|
||||||
|
#define cudaGraph_t musaGraph_t
|
||||||
|
#define cudaKernelNodeParams musaKernelNodeParams
|
||||||
|
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||||
|
#define cudaStreamEndCapture musaStreamEndCapture
|
||||||
|
|
||||||
|
// XXX: Clang builtins mapping
|
||||||
|
#define __vsub4 __vsub4_musa
|
||||||
|
#define __vcmpeq4 __vcmpeq4_musa
|
||||||
|
#define __vcmpne4 __vcmpne4_musa
|
||||||
|
|
||||||
|
#ifndef __has_builtin
|
||||||
|
#define __has_builtin(x) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||||
|
return __vsubss4(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user