mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 05:17:21 +01:00
Fix f16_sycl cpy call from Arc (#5411)
* fix f16_sycl cpy call * rm old logic * add fp16 build CI * use macro * format fix
This commit is contained in:
parent
ff4ff05c5f
commit
6e99f2a04f
41
.github/workflows/build.yml
vendored
41
.github/workflows/build.yml
vendored
@ -184,6 +184,47 @@ jobs:
|
|||||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
|
ubuntu-22-cmake-sycl-fp16:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
|
continue-on-error: true
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: add oneAPI to apt
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
cd /tmp
|
||||||
|
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||||
|
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||||
|
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||||
|
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||||
|
|
||||||
|
- name: install oneAPI dpcpp compiler
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install intel-oneapi-compiler-dpcpp-cpp
|
||||||
|
|
||||||
|
- name: install oneAPI MKL library
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
sudo apt install intel-oneapi-mkl-devel
|
||||||
|
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
|
||||||
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
# how to debug it.
|
# how to debug it.
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
||||||
|
@ -12148,7 +12148,8 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|||||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||||
const dpct::queue_ptr &stream) {
|
const dpct::queue_ptr &stream) {
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
const int64_t row_diff = row_high - row_low;
|
const int64_t row_diff = row_high - row_low;
|
||||||
|
|
||||||
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
||||||
@ -12167,8 +12168,9 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|||||||
} else {
|
} else {
|
||||||
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
||||||
ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
|
ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
|
||||||
ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1,
|
ne00, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||||
sizeof(sycl::half), 0, 0, stream);
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12,
|
||||||
|
nb13, stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
Loading…
x
Reference in New Issue
Block a user