mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 21:10:24 +01:00
[SYCL] Fix windows build and inference (#8003)
* add sycl preset * fix debug link error. fix windows crash * update README
This commit is contained in:
parent
d50f8897a7
commit
de391e4c80
@ -665,6 +665,7 @@ if (LLAMA_SYCL)
|
|||||||
#todo: AOT
|
#todo: AOT
|
||||||
|
|
||||||
find_package(IntelSYCL REQUIRED)
|
find_package(IntelSYCL REQUIRED)
|
||||||
|
find_package(MKL REQUIRED)
|
||||||
|
|
||||||
message(STATUS "SYCL found")
|
message(STATUS "SYCL found")
|
||||||
|
|
||||||
@ -679,11 +680,9 @@ if (LLAMA_SYCL)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_compile_options(-I./) #include DPCT
|
add_compile_options(-I./) #include DPCT
|
||||||
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
|
||||||
|
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
|
||||||
if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
|
if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||||
endif()
|
endif()
|
||||||
@ -693,8 +692,10 @@ if (LLAMA_SYCL)
|
|||||||
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
||||||
|
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||||
else()
|
else()
|
||||||
|
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
||||||
if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
|
if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||||
elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
|
elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
|
||||||
|
@ -11,9 +11,21 @@
|
|||||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "sycl-base",
|
||||||
|
"hidden": true,
|
||||||
|
"generator": "Ninja",
|
||||||
|
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||||
|
"CMAKE_CXX_COMPILER": "icx",
|
||||||
|
"LLAMA_SYCL": "ON",
|
||||||
|
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||||
|
}
|
||||||
|
},
|
||||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||||
|
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||||
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -35,15 +47,18 @@
|
|||||||
},
|
},
|
||||||
|
|
||||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
|
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
|
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
|
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
|
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
||||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
|
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
||||||
|
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -410,15 +410,9 @@ Output (example):
|
|||||||
|
|
||||||
4. Install build tools
|
4. Install build tools
|
||||||
|
|
||||||
a. Download & install cmake for Windows: https://cmake.org/download/
|
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
|
||||||
|
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
||||||
|
|
||||||
b. Download & install mingw-w64 make for Windows provided by w64devkit
|
|
||||||
|
|
||||||
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
|
|
||||||
|
|
||||||
- Extract `w64devkit` on your pc.
|
|
||||||
|
|
||||||
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
|
|
||||||
|
|
||||||
### II. Build llama.cpp
|
### II. Build llama.cpp
|
||||||
|
|
||||||
@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
|
|||||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||||
|
|
||||||
# Option 2: Or FP16
|
# Option 2: Or FP16
|
||||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
cmake --build build --config Release -j
|
cmake --build build --config Release -j
|
||||||
```
|
```
|
||||||
@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
|
|||||||
.\examples\sycl\win-build-sycl.bat
|
.\examples\sycl\win-build-sycl.bat
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Or, use CMake presets to build:
|
||||||
|
```sh
|
||||||
|
cmake --preset x64-windows-sycl-release
|
||||||
|
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||||
|
|
||||||
|
cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||||
|
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||||
|
|
||||||
|
cmake --preset x64-windows-sycl-debug
|
||||||
|
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||||
|
```
|
||||||
|
|
||||||
|
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
|
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
||||||
|
|
||||||
### III. Run the inference
|
### III. Run the inference
|
||||||
|
|
||||||
|
@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR
|
|||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
|
||||||
:: build all binary
|
:: build all binary
|
||||||
make -j
|
cmake --build . -j
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
|
|
||||||
cd ..
|
cd ..
|
||||||
|
@ -4911,7 +4911,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
|
|||||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
GGML_TENSOR_BINARY_OP_LOCALS01;
|
||||||
|
|
||||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||||
queue_ptr main_stream = ctx.stream();
|
queue_ptr main_stream = ctx.stream();
|
||||||
|
@ -589,94 +589,75 @@ namespace dpct
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// dpct device extension
|
/// dpct device extension
|
||||||
class device_ext : public sycl::device
|
class device_ext : public sycl::device {
|
||||||
{
|
|
||||||
typedef std::mutex mutex_type;
|
typedef std::mutex mutex_type;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
device_ext() : sycl::device(), _ctx(*this) {}
|
device_ext() : sycl::device() {}
|
||||||
~device_ext()
|
~device_ext() {
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
clear_queues();
|
clear_queues();
|
||||||
}
|
}
|
||||||
device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
|
device_ext(const sycl::device &base) : sycl::device(base) {
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
init_queues();
|
init_queues();
|
||||||
}
|
}
|
||||||
|
|
||||||
int is_native_atomic_supported() { return 0; }
|
int is_native_atomic_supported() { return 0; }
|
||||||
int get_major_version() const
|
int get_major_version() const { return dpct::get_major_version(*this); }
|
||||||
{
|
|
||||||
return dpct::get_major_version(*this);
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_minor_version() const
|
int get_minor_version() const { return dpct::get_minor_version(*this); }
|
||||||
{
|
|
||||||
return dpct::get_minor_version(*this);
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_max_compute_units() const
|
int get_max_compute_units() const {
|
||||||
{
|
|
||||||
return get_device_info().get_max_compute_units();
|
return get_device_info().get_max_compute_units();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the maximum clock frequency of this device in KHz.
|
/// Return the maximum clock frequency of this device in KHz.
|
||||||
int get_max_clock_frequency() const
|
int get_max_clock_frequency() const {
|
||||||
{
|
|
||||||
return get_device_info().get_max_clock_frequency();
|
return get_device_info().get_max_clock_frequency();
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_integrated() const { return get_device_info().get_integrated(); }
|
int get_integrated() const { return get_device_info().get_integrated(); }
|
||||||
|
|
||||||
int get_max_sub_group_size() const
|
int get_max_sub_group_size() const {
|
||||||
{
|
|
||||||
return get_device_info().get_max_sub_group_size();
|
return get_device_info().get_max_sub_group_size();
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_max_register_size_per_work_group() const
|
int get_max_register_size_per_work_group() const {
|
||||||
{
|
|
||||||
return get_device_info().get_max_register_size_per_work_group();
|
return get_device_info().get_max_register_size_per_work_group();
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_max_work_group_size() const
|
int get_max_work_group_size() const {
|
||||||
{
|
|
||||||
return get_device_info().get_max_work_group_size();
|
return get_device_info().get_max_work_group_size();
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_mem_base_addr_align() const
|
int get_mem_base_addr_align() const {
|
||||||
{
|
|
||||||
return get_info<sycl::info::device::mem_base_addr_align>();
|
return get_info<sycl::info::device::mem_base_addr_align>();
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t get_global_mem_size() const
|
size_t get_global_mem_size() const {
|
||||||
{
|
|
||||||
return get_device_info().get_global_mem_size();
|
return get_device_info().get_global_mem_size();
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t get_max_mem_alloc_size() const
|
size_t get_max_mem_alloc_size() const {
|
||||||
{
|
|
||||||
return get_device_info().get_max_mem_alloc_size();
|
return get_device_info().get_max_mem_alloc_size();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the number of bytes of free and total memory on the SYCL device.
|
/// Get the number of bytes of free and total memory on the SYCL device.
|
||||||
/// \param [out] free_memory The number of bytes of free memory on the SYCL device.
|
/// \param [out] free_memory The number of bytes of free memory on the
|
||||||
/// \param [out] total_memory The number of bytes of total memory on the SYCL device.
|
/// SYCL device. \param [out] total_memory The number of bytes of total
|
||||||
void get_memory_info(size_t &free_memory, size_t &total_memory)
|
/// memory on the SYCL device.
|
||||||
{
|
void get_memory_info(size_t &free_memory, size_t &total_memory) {
|
||||||
total_memory = get_device_info().get_global_mem_size();
|
total_memory = get_device_info().get_global_mem_size();
|
||||||
const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
|
const char *warning_info =
|
||||||
|
"get_memory_info: [warning] ext_intel_free_memory is not "
|
||||||
"supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
|
"supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
|
||||||
"use total memory as free memory";
|
"use total memory as free memory";
|
||||||
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
|
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
|
||||||
if (!has(sycl::aspect::ext_intel_free_memory))
|
if (!has(sycl::aspect::ext_intel_free_memory)) {
|
||||||
{
|
|
||||||
std::cerr << warning_info << std::endl;
|
std::cerr << warning_info << std::endl;
|
||||||
free_memory = total_memory;
|
free_memory = total_memory;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
|
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -690,164 +671,139 @@ namespace dpct
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_device_info(device_info &out) const
|
void get_device_info(device_info &out) const {
|
||||||
{
|
|
||||||
dpct::get_device_info(out, *this);
|
dpct::get_device_info(out, *this);
|
||||||
}
|
}
|
||||||
|
|
||||||
device_info get_device_info() const
|
device_info get_device_info() const {
|
||||||
{
|
|
||||||
device_info prop;
|
device_info prop;
|
||||||
dpct::get_device_info(prop, *this);
|
dpct::get_device_info(prop, *this);
|
||||||
return prop;
|
return prop;
|
||||||
}
|
}
|
||||||
|
|
||||||
void reset()
|
void reset() {
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
clear_queues();
|
clear_queues();
|
||||||
init_queues();
|
init_queues();
|
||||||
}
|
}
|
||||||
|
|
||||||
sycl::queue &in_order_queue() { return *_q_in_order; }
|
sycl::queue &in_order_queue() { return _q_in_order; }
|
||||||
|
|
||||||
sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
|
sycl::queue &out_of_order_queue() { return _q_out_of_order; }
|
||||||
|
|
||||||
sycl::queue &default_queue()
|
sycl::queue &default_queue() { return in_order_queue(); }
|
||||||
{
|
|
||||||
return in_order_queue();
|
|
||||||
}
|
|
||||||
|
|
||||||
void queues_wait_and_throw()
|
void queues_wait_and_throw() {
|
||||||
{
|
|
||||||
std::unique_lock<mutex_type> lock(m_mutex);
|
std::unique_lock<mutex_type> lock(m_mutex);
|
||||||
std::vector<std::shared_ptr<sycl::queue>> current_queues(
|
|
||||||
_queues);
|
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
for (const auto &q : current_queues)
|
for (auto &q : _queues) {
|
||||||
{
|
q.wait_and_throw();
|
||||||
q->wait_and_throw();
|
|
||||||
}
|
}
|
||||||
// Guard the destruct of current_queues to make sure the ref count is safe.
|
// Guard the destruct of current_queues to make sure the ref count is
|
||||||
|
// safe.
|
||||||
lock.lock();
|
lock.lock();
|
||||||
}
|
}
|
||||||
|
|
||||||
sycl::queue *create_queue(bool enable_exception_handler = false)
|
sycl::queue create_queue(bool enable_exception_handler = false) {
|
||||||
{
|
|
||||||
return create_in_order_queue(enable_exception_handler);
|
return create_in_order_queue(enable_exception_handler);
|
||||||
}
|
}
|
||||||
|
|
||||||
sycl::queue *create_queue(sycl::context context, sycl::device device,
|
sycl::queue create_queue(sycl::device device,
|
||||||
bool enable_exception_handler = false) {
|
bool enable_exception_handler = false) {
|
||||||
return create_in_order_queue(context, device, enable_exception_handler);
|
return create_in_order_queue(device, enable_exception_handler);
|
||||||
}
|
}
|
||||||
|
|
||||||
sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
|
sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
return create_queue_impl(enable_exception_handler,
|
return create_queue_impl(enable_exception_handler,
|
||||||
sycl::property::queue::in_order());
|
sycl::property::queue::in_order());
|
||||||
}
|
}
|
||||||
|
|
||||||
sycl::queue *create_in_order_queue(sycl::context context, sycl::device device,
|
sycl::queue create_in_order_queue(sycl::device device,
|
||||||
bool enable_exception_handler = false) {
|
bool enable_exception_handler = false) {
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
return create_queue_impl(context, device, enable_exception_handler,
|
return create_queue_impl(device, enable_exception_handler,
|
||||||
sycl::property::queue::in_order());
|
sycl::property::queue::in_order());
|
||||||
}
|
}
|
||||||
|
|
||||||
sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
|
sycl::queue create_out_of_order_queue(
|
||||||
|
bool enable_exception_handler = false) {
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
return create_queue_impl(enable_exception_handler);
|
return create_queue_impl(enable_exception_handler);
|
||||||
}
|
}
|
||||||
|
|
||||||
void destroy_queue(sycl::queue *&queue)
|
void destroy_queue(sycl::queue queue) {
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
|
_queues.clear();
|
||||||
[=](const std::shared_ptr<sycl::queue> &q) -> bool
|
|
||||||
{
|
|
||||||
return q.get() == queue;
|
|
||||||
}),
|
|
||||||
_queues.end());
|
|
||||||
queue = nullptr;
|
|
||||||
}
|
}
|
||||||
void set_saved_queue(sycl::queue *q)
|
void set_saved_queue(sycl::queue q) {
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
_saved_queue = q;
|
_saved_queue = q;
|
||||||
}
|
}
|
||||||
sycl::queue *get_saved_queue() const
|
sycl::queue get_saved_queue() const {
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
return _saved_queue;
|
return _saved_queue;
|
||||||
}
|
}
|
||||||
sycl::context get_context() const { return _ctx; }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void clear_queues()
|
void clear_queues() { _queues.clear(); }
|
||||||
{
|
|
||||||
_queues.clear();
|
|
||||||
_q_in_order = _q_out_of_order = _saved_queue = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void init_queues()
|
void init_queues() {
|
||||||
{
|
_q_in_order =
|
||||||
_q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
|
create_queue_impl(true, sycl::property::queue::in_order());
|
||||||
_q_out_of_order = create_queue_impl(true);
|
_q_out_of_order = create_queue_impl(true);
|
||||||
_saved_queue = &default_queue();
|
_saved_queue = default_queue();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Caller should acquire resource \p m_mutex before calling this function.
|
/// Caller should acquire resource \p m_mutex before calling this
|
||||||
|
/// function.
|
||||||
template <class... Properties>
|
template <class... Properties>
|
||||||
sycl::queue *create_queue_impl(bool enable_exception_handler,
|
sycl::queue create_queue_impl(bool enable_exception_handler,
|
||||||
Properties... properties)
|
Properties... properties) {
|
||||||
{
|
|
||||||
sycl::async_handler eh = {};
|
sycl::async_handler eh = {};
|
||||||
if (enable_exception_handler)
|
if (enable_exception_handler) {
|
||||||
{
|
|
||||||
eh = exception_handler;
|
eh = exception_handler;
|
||||||
}
|
}
|
||||||
_queues.push_back(std::make_shared<sycl::queue>(
|
auto q = sycl::queue(*this, eh,
|
||||||
_ctx, *this, eh,
|
|
||||||
sycl::property_list(
|
sycl::property_list(
|
||||||
#ifdef DPCT_PROFILING_ENABLED
|
#ifdef DPCT_PROFILING_ENABLED
|
||||||
sycl::property::queue::enable_profiling(),
|
sycl::property::queue::enable_profiling(),
|
||||||
#endif
|
#endif
|
||||||
properties...)));
|
properties...));
|
||||||
|
_queues.push_back(q);
|
||||||
|
|
||||||
return _queues.back().get();
|
return _queues.back();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class... Properties>
|
template <class... Properties>
|
||||||
sycl::queue *create_queue_impl(sycl::context context, sycl::device device,
|
sycl::queue create_queue_impl(sycl::device device,
|
||||||
bool enable_exception_handler,
|
bool enable_exception_handler,
|
||||||
Properties... properties) {
|
Properties... properties) {
|
||||||
sycl::async_handler eh = {};
|
sycl::async_handler eh = {};
|
||||||
if (enable_exception_handler) {
|
if (enable_exception_handler) {
|
||||||
eh = exception_handler;
|
eh = exception_handler;
|
||||||
}
|
}
|
||||||
_queues.push_back(std::make_shared<sycl::queue>(
|
_queues.push_back(
|
||||||
context, device, eh,
|
sycl::queue(device, eh,
|
||||||
sycl::property_list(
|
sycl::property_list(
|
||||||
#ifdef DPCT_PROFILING_ENABLED
|
#ifdef DPCT_PROFILING_ENABLED
|
||||||
sycl::property::queue::enable_profiling(),
|
sycl::property::queue::enable_profiling(),
|
||||||
#endif
|
#endif
|
||||||
properties...)));
|
properties...)));
|
||||||
|
|
||||||
return _queues.back().get();
|
return _queues.back();
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_version(int &major, int &minor) const
|
void get_version(int &major, int &minor) const {
|
||||||
{
|
|
||||||
detail::get_version(*this, major, minor);
|
detail::get_version(*this, major, minor);
|
||||||
}
|
}
|
||||||
sycl::queue *_q_in_order, *_q_out_of_order;
|
sycl::queue _q_in_order, _q_out_of_order;
|
||||||
sycl::queue *_saved_queue;
|
sycl::queue _saved_queue;
|
||||||
sycl::context _ctx;
|
std::vector<sycl::queue> _queues;
|
||||||
std::vector<std::shared_ptr<sycl::queue>> _queues;
|
|
||||||
mutable mutex_type m_mutex;
|
mutable mutex_type m_mutex;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/// device manager
|
/// device manager
|
||||||
class dev_mgr
|
class dev_mgr
|
||||||
{
|
{
|
||||||
|
6
ggml.h
6
ggml.h
@ -312,6 +312,12 @@
|
|||||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
||||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||||
|
|
||||||
|
#define GGML_TENSOR_BINARY_OP_LOCALS01 \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user