mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 22:08:55 +01:00
sycl: Use syclcompat::dp4a (#10267)
* sycl: Use syclcompat::dp4a
* Using the syclcompat version allow the compiler to optimize the
operation with native function
* Update news section
* Update CI Windows oneAPI version to 2025.0
* Reword doc
* Call syclcompat::dp4a inside dpct::dp4a
This reverts commit 90cb61d692
.
This commit is contained in:
parent
1607a5e5b0
commit
5a54af4d4f
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@ -930,7 +930,7 @@ jobs:
|
|||||||
shell: bash
|
shell: bash
|
||||||
|
|
||||||
env:
|
env:
|
||||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
|
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
|
||||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
|
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
|
||||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||||
steps:
|
steps:
|
||||||
|
@ -41,6 +41,8 @@ The following release is verified with good quality:
|
|||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
||||||
|
- 2024.11
|
||||||
|
- Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
|
||||||
|
|
||||||
- 2024.8
|
- 2024.8
|
||||||
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
#include <sycl/sycl.hpp>
|
#include <sycl/sycl.hpp>
|
||||||
#include <sycl/half_type.hpp>
|
#include <sycl/half_type.hpp>
|
||||||
|
#include <syclcompat/math.hpp>
|
||||||
#include <oneapi/mkl.hpp>
|
#include <oneapi/mkl.hpp>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
|
||||||
@ -1830,31 +1831,10 @@ namespace dpct
|
|||||||
: id);
|
: id);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
|
|
||||||
{
|
|
||||||
return sycl::vec<T, 1>(val)
|
|
||||||
.template as<sycl::vec<
|
|
||||||
std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
|
|
||||||
.template convert<T>();
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T1, typename T2>
|
|
||||||
using dot_product_acc_t =
|
|
||||||
std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
|
|
||||||
uint32_t, int32_t>;
|
|
||||||
|
|
||||||
template <typename T1, typename T2, typename T3>
|
template <typename T1, typename T2, typename T3>
|
||||||
inline auto dp4a(T1 a, T2 b, T3 c)
|
inline auto dp4a(T1 a, T2 b, T3 c)
|
||||||
{
|
{
|
||||||
dot_product_acc_t<T1, T2> res = c;
|
return syclcompat::dp4a(a, b, c);
|
||||||
auto va = extract_and_sign_or_zero_extend4(a);
|
|
||||||
auto vb = extract_and_sign_or_zero_extend4(b);
|
|
||||||
res += va[0] * vb[0];
|
|
||||||
res += va[1] * vb[1];
|
|
||||||
res += va[2] * vb[2];
|
|
||||||
res += va[3] * vb[3];
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct sub_sat
|
struct sub_sat
|
||||||
|
@ -968,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|||||||
grid1[0] ^ signs[0], signs[0], std::minus<>());
|
grid1[0] ^ signs[0], signs[0], std::minus<>());
|
||||||
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
||||||
grid2[0] ^ signs[1], signs[1], std::minus<>());
|
grid2[0] ^ signs[1], signs[1], std::minus<>());
|
||||||
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
|
sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
|
||||||
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
|
||||||
q8 += 8;
|
q8 += 8;
|
||||||
aux32 >>= 7;
|
aux32 >>= 7;
|
||||||
}
|
}
|
||||||
@ -1009,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|||||||
grid1[0] ^ signs0, signs0, std::minus<>());
|
grid1[0] ^ signs0, signs0, std::minus<>());
|
||||||
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
||||||
grid2[0] ^ signs1, signs1, std::minus<>());
|
grid2[0] ^ signs1, signs1, std::minus<>());
|
||||||
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
|
sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
|
||||||
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
|
||||||
q8 += 8;
|
q8 += 8;
|
||||||
}
|
}
|
||||||
const float d =
|
const float d =
|
||||||
|
Loading…
Reference in New Issue
Block a user