mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-14 22:38:58 +01:00
[SYCL] fix multi-gpu issue on sycl (#8554)
--------- Signed-off-by: Chen Xi <xi2chen@intel.com> Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
This commit is contained in:
parent
eddcb5238b
commit
ed67bcb24f
@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
|
|||||||
```sh
|
```sh
|
||||||
./build/bin/llama-ls-sycl-device
|
./build/bin/llama-ls-sycl-device
|
||||||
```
|
```
|
||||||
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
|
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
||||||
```
|
```
|
||||||
found 6 SYCL devices:
|
found 2 SYCL devices:
|
||||||
|
|
||||||
| | | |Compute |Max compute|Max work|Max sub| |
|
| | | |Compute |Max compute|Max work|Max sub| |
|
||||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
|
||||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
|
||||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
|
||||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
|
||||||
```
|
```
|
||||||
|
|
||||||
| Attribute | Note |
|
|
||||||
|------------------------|-------------------------------------------------------------|
|
|
||||||
| compute capability 1.3 | Level-zero driver/runtime, recommended |
|
|
||||||
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
|
|
||||||
|
|
||||||
4. Launch inference
|
4. Launch inference
|
||||||
|
|
||||||
There are two device selection modes:
|
There are two device selection modes:
|
||||||
|
|
||||||
- Single device: Use one device target specified by the user.
|
- Single device: Use one device target specified by the user.
|
||||||
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
|
- Multiple devices: Automatically choose the devices with the same backend.
|
||||||
|
|
||||||
|
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
|
||||||
|
|
||||||
| Device selection | Parameter |
|
| Device selection | Parameter |
|
||||||
|------------------|----------------------------------------|
|
|------------------|----------------------------------------|
|
||||||
@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
|
|||||||
build\bin\ls-sycl-device.exe
|
build\bin\ls-sycl-device.exe
|
||||||
```
|
```
|
||||||
|
|
||||||
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
|
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
||||||
```
|
```
|
||||||
found 6 SYCL devices:
|
found 2 SYCL devices:
|
||||||
| | | |Compute |Max compute|Max work|Max sub| |
|
| | | |Compute |Max compute|Max work|Max sub| |
|
||||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
|
||||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
|
||||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
|
||||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
| Attribute | Note |
|
|
||||||
|------------------------|-----------------------------------------------------------|
|
|
||||||
| compute capability 1.3 | Level-zero running time, recommended |
|
|
||||||
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
|
|
||||||
|
|
||||||
|
|
||||||
4. Launch inference
|
4. Launch inference
|
||||||
|
|
||||||
There are two device selection modes:
|
There are two device selection modes:
|
||||||
|
|
||||||
- Single device: Use one device assigned by user.
|
- Single device: Use one device assigned by user. Default device id is 0.
|
||||||
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
|
- Multiple devices: Automatically choose the devices with the same backend.
|
||||||
|
|
||||||
|
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
|
||||||
|
|
||||||
| Device selection | Parameter |
|
| Device selection | Parameter |
|
||||||
|------------------|----------------------------------------|
|
|------------------|----------------------------------------|
|
||||||
|
@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
|
|||||||
|
|
||||||
queue_ptr stream(int device, int stream) {
|
queue_ptr stream(int device, int stream) {
|
||||||
if (qptrs[device][stream] == nullptr) {
|
if (qptrs[device][stream] == nullptr) {
|
||||||
qptrs[device][stream] = &(dpct::get_current_device().default_queue());
|
qptrs[device][stream] = &(dpct::get_device(device).default_queue());
|
||||||
}
|
}
|
||||||
return qptrs[device][stream];
|
return qptrs[device][stream];
|
||||||
}
|
}
|
||||||
|
@ -588,7 +588,7 @@ namespace dpct
|
|||||||
out = prop;
|
out = prop;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// dpct device extension
|
/// dpct device extension
|
||||||
class device_ext : public sycl::device {
|
class device_ext : public sycl::device {
|
||||||
typedef std::mutex mutex_type;
|
typedef std::mutex mutex_type;
|
||||||
|
|
||||||
@ -697,7 +697,7 @@ namespace dpct
|
|||||||
std::unique_lock<mutex_type> lock(m_mutex);
|
std::unique_lock<mutex_type> lock(m_mutex);
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
for (auto &q : _queues) {
|
for (auto &q : _queues) {
|
||||||
q.wait_and_throw();
|
q.wait_and_throw();
|
||||||
}
|
}
|
||||||
// Guard the destruct of current_queues to make sure the ref count is
|
// Guard the destruct of current_queues to make sure the ref count is
|
||||||
// safe.
|
// safe.
|
||||||
@ -734,7 +734,12 @@ namespace dpct
|
|||||||
|
|
||||||
void destroy_queue(sycl::queue queue) {
|
void destroy_queue(sycl::queue queue) {
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
_queues.clear();
|
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
|
||||||
|
[=](const sycl::queue &q) -> bool
|
||||||
|
{
|
||||||
|
return q == queue;
|
||||||
|
}),
|
||||||
|
_queues.end());
|
||||||
}
|
}
|
||||||
void set_saved_queue(sycl::queue q) {
|
void set_saved_queue(sycl::queue q) {
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
@ -764,13 +769,13 @@ namespace dpct
|
|||||||
if (enable_exception_handler) {
|
if (enable_exception_handler) {
|
||||||
eh = exception_handler;
|
eh = exception_handler;
|
||||||
}
|
}
|
||||||
auto q = sycl::queue(*this, eh,
|
_queues.push_back(sycl::queue(
|
||||||
sycl::property_list(
|
*this, eh,
|
||||||
|
sycl::property_list(
|
||||||
#ifdef DPCT_PROFILING_ENABLED
|
#ifdef DPCT_PROFILING_ENABLED
|
||||||
sycl::property::queue::enable_profiling(),
|
sycl::property::queue::enable_profiling(),
|
||||||
#endif
|
#endif
|
||||||
properties...));
|
properties...)));
|
||||||
_queues.push_back(q);
|
|
||||||
|
|
||||||
return _queues.back();
|
return _queues.back();
|
||||||
}
|
}
|
||||||
@ -783,8 +788,8 @@ namespace dpct
|
|||||||
if (enable_exception_handler) {
|
if (enable_exception_handler) {
|
||||||
eh = exception_handler;
|
eh = exception_handler;
|
||||||
}
|
}
|
||||||
_queues.push_back(
|
_queues.push_back(sycl::queue(
|
||||||
sycl::queue(device, eh,
|
device, eh,
|
||||||
sycl::property_list(
|
sycl::property_list(
|
||||||
#ifdef DPCT_PROFILING_ENABLED
|
#ifdef DPCT_PROFILING_ENABLED
|
||||||
sycl::property::queue::enable_profiling(),
|
sycl::property::queue::enable_profiling(),
|
||||||
@ -855,15 +860,75 @@ namespace dpct
|
|||||||
unsigned int get_device_id(const sycl::device &dev)
|
unsigned int get_device_id(const sycl::device &dev)
|
||||||
{
|
{
|
||||||
unsigned int id = 0;
|
unsigned int id = 0;
|
||||||
for (auto dev_item : _devs)
|
for (auto &dev_item : _devs)
|
||||||
{
|
{
|
||||||
if (*dev_item == dev)
|
if (*dev_item == dev)
|
||||||
{
|
{
|
||||||
break;
|
return id;
|
||||||
}
|
}
|
||||||
id++;
|
id++;
|
||||||
}
|
}
|
||||||
return id;
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::string get_preferred_gpu_platform_name() {
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
std::string filter = "level-zero";
|
||||||
|
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
|
||||||
|
if (env) {
|
||||||
|
if (std::strstr(env, "level_zero")) {
|
||||||
|
filter = "level-zero";
|
||||||
|
}
|
||||||
|
else if (std::strstr(env, "opencl")) {
|
||||||
|
filter = "opencl";
|
||||||
|
}
|
||||||
|
else if (std::strstr(env, "cuda")) {
|
||||||
|
filter = "cuda";
|
||||||
|
}
|
||||||
|
else if (std::strstr(env, "hip")) {
|
||||||
|
filter = "hip";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw std::runtime_error("invalid device filter: " + std::string(env));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto plaform_list = sycl::platform::get_platforms();
|
||||||
|
|
||||||
|
for (const auto& platform : plaform_list) {
|
||||||
|
auto devices = platform.get_devices();
|
||||||
|
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
|
||||||
|
return d.is_gpu();
|
||||||
|
});
|
||||||
|
|
||||||
|
if (gpu_dev == devices.end()) {
|
||||||
|
// cout << "platform [" << platform_name
|
||||||
|
// << "] does not contain GPU devices, skipping\n";
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto platform_name = platform.get_info<sycl::info::platform::name>();
|
||||||
|
std::string platform_name_low_case;
|
||||||
|
platform_name_low_case.resize(platform_name.size());
|
||||||
|
|
||||||
|
std::transform(
|
||||||
|
platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
|
||||||
|
|
||||||
|
if (platform_name_low_case.find(filter) == std::string::npos) {
|
||||||
|
// cout << "platform [" << platform_name
|
||||||
|
// << "] does not match with requested "
|
||||||
|
// << filter << ", skipping\n";
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
result = platform_name;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result.empty())
|
||||||
|
throw std::runtime_error("can not find preferred GPU platform");
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class DeviceSelector>
|
template <class DeviceSelector>
|
||||||
@ -930,10 +995,15 @@ namespace dpct
|
|||||||
// Keep track of the number of devices per backend
|
// Keep track of the number of devices per backend
|
||||||
std::map<sycl::backend, size_t> DeviceNums;
|
std::map<sycl::backend, size_t> DeviceNums;
|
||||||
std::map<std::string, std::vector<sycl::device>> backend_devices;
|
std::map<std::string, std::vector<sycl::device>> backend_devices;
|
||||||
|
auto preferred_platform_name = get_preferred_gpu_platform_name();
|
||||||
|
|
||||||
while (!Platforms.empty()) {
|
while (!Platforms.empty()) {
|
||||||
auto Platform = Platforms.back();
|
auto Platform = Platforms.back();
|
||||||
Platforms.pop_back();
|
Platforms.pop_back();
|
||||||
|
auto platform_name = Platform.get_info<sycl::info::platform::name>();
|
||||||
|
if (platform_name.compare(preferred_platform_name) != 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
auto devices = Platform.get_devices();
|
auto devices = Platform.get_devices();
|
||||||
std::string backend_type = get_device_backend_and_type(devices[0]);
|
std::string backend_type = get_device_backend_and_type(devices[0]);
|
||||||
for (const auto &device : devices) {
|
for (const auto &device : devices) {
|
||||||
@ -1989,6 +2059,11 @@ namespace dpct
|
|||||||
return dev_mgr::instance().current_device();
|
return dev_mgr::instance().current_device();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline device_ext &get_device(unsigned int id)
|
||||||
|
{
|
||||||
|
return dev_mgr::instance().get_device(id);
|
||||||
|
}
|
||||||
|
|
||||||
static inline sycl::queue &get_in_order_queue()
|
static inline sycl::queue &get_in_order_queue()
|
||||||
{
|
{
|
||||||
return dev_mgr::instance().current_device().in_order_queue();
|
return dev_mgr::instance().current_device().in_order_queue();
|
||||||
|
@ -16643,9 +16643,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
||||||
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
|
||||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user