mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-10-30 06:30:15 +01:00
cuda : add half2 __shfl_xor() for ROCm 5.5 (#7263)
This commit is contained in:
parent
0f98acfac6
commit
d233b507cd
@ -315,6 +315,20 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
|||||||
#endif
|
#endif
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||||
|
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||||
|
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||||
|
typedef union half2_b32 {
|
||||||
|
half2 val;
|
||||||
|
int b32;
|
||||||
|
} half2_b32_t;
|
||||||
|
half2_b32_t tmp;
|
||||||
|
tmp.val = var;
|
||||||
|
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||||
|
return tmp.val;
|
||||||
|
}
|
||||||
|
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
|
Loading…
Reference in New Issue
Block a user