From 3420909dffa50e70660524797a1e715a717684d2 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 1 Dec 2024 16:12:41 +0100 Subject: [PATCH] ggml : automatic selection of best CPU backend (#10606) * ggml : automatic selection of best CPU backend * amx : minor opt * add GGML_AVX_VNNI to enable avx-vnni, fix checks --- .devops/llama-server.Dockerfile | 20 +- CMakeLists.txt | 4 - Package.swift | 2 +- ggml/CMakeLists.txt | 1 + ggml/src/ggml-backend-impl.h | 58 ++++-- ggml/src/ggml-backend-reg.cpp | 270 ++++++++++++++++-------- ggml/src/ggml-cpu/CMakeLists.txt | 13 ++ ggml/src/ggml-cpu/amx/common.h | 1 - ggml/src/ggml-cpu/amx/mmq.cpp | 74 ++++--- ggml/src/ggml-cpu/cpu-feats-x86.cpp | 298 +++++++++++++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +- scripts/build-cpu.sh | 12 ++ 12 files changed, 599 insertions(+), 156 deletions(-) create mode 100644 ggml/src/ggml-cpu/cpu-feats-x86.cpp create mode 100755 scripts/build-cpu.sh diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index 02accc85e..7110dda9e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -3,22 +3,34 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential git libcurl4-openssl-dev + apt-get install -y build-essential git cmake libcurl4-openssl-dev WORKDIR /app COPY . . -ENV LLAMA_CURL=1 -RUN make -j$(nproc) llama-server +RUN \ + # Build multiple versions of the CPU backend + scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \ + scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \ + scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \ + scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ + # Build llama-server + cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build --target llama-server -j $(nproc) && \ + # Copy the built libraries to /app/lib + mkdir -p /app/lib && \ + mv libggml-cpu* /app/lib/ && \ + find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/llama-server /llama-server +COPY --from=build /app/build/bin/llama-server /llama-server +COPY --from=build /app/lib/ / ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d389dccb..f84fff9e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,10 +96,6 @@ if (NOT DEFINED GGML_LLAMAFILE) set(GGML_LLAMAFILE_DEFAULT ON) endif() -if (NOT DEFINED GGML_AMX) - set(GGML_AMX ON) -endif() - if (NOT DEFINED GGML_CUDA_GRAPHS) set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() diff --git a/Package.swift b/Package.swift index 1e75aa7e2..d32b74a63 100644 --- a/Package.swift +++ b/Package.swift @@ -88,5 +88,5 @@ let package = Package( linkerSettings: linkerSettings ) ], - cxxLanguageStandard: .cxx11 + cxxLanguageStandard: .cxx17 ) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 789fa3b0c..06d371e09 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) option(GGML_AVX512 "ggml: enable AVX512" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index dff7749b4..36d72e95f 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -211,27 +211,45 @@ extern "C" { GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); // Add backend dynamic loading support to the backend - typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); - #ifdef GGML_BACKEND_DL - #ifdef __cplusplus - # define GGML_BACKEND_DL_IMPL(reg_fn) \ - extern "C" { \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ - } \ - ggml_backend_reg_t ggml_backend_init(void) { \ - return reg_fn(); \ - } - #else - # define GGML_BACKEND_DL_IMPL(reg_fn) \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ - ggml_backend_reg_t ggml_backend_init(void) { \ - return reg_fn(); \ - } - #endif - #else - # define GGML_BACKEND_DL_IMPL(reg_fn) - #endif + // Initialize the backend + typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); + // Optional: obtain a score for the backend based on the system configuration + // Higher scores are preferred, 0 means the backend is not supported in the current system + typedef int (*ggml_backend_score_t)(void); + +#ifdef GGML_BACKEND_DL +# ifdef __cplusplus +# define GGML_BACKEND_DL_IMPL(reg_fn) \ + extern "C" { \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + } \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } +# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \ + extern "C" { \ + GGML_BACKEND_API int ggml_backend_score(void); \ + } \ + int ggml_backend_score(void) { \ + return score_fn(); \ + } +# else +# define GGML_BACKEND_DL_IMPL(reg_fn) \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } +# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \ + GGML_BACKEND_API int ggml_backend_score(void); \ + int ggml_backend_score(void) { \ + return score_fn(); \ + } +# endif +#else +# define GGML_BACKEND_DL_IMPL(reg_fn) +# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) +#endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 3182b84f5..2c4bf11b0 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -2,8 +2,13 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include +#include #include +#include +#include +#include #include +#include #include #ifdef _WIN32 @@ -57,9 +62,71 @@ #include "ggml-kompute.h" #endif +#ifdef _WIN32 + +using dl_handle = std::remove_pointer_t; + +struct dl_handle_deleter { + void operator()(HMODULE handle) { + FreeLibrary(handle); + } +}; + +static dl_handle * dl_load_library(const std::wstring & path) { + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryW(path.c_str()); + + SetErrorMode(old_mode); + + return handle; +} + +static dl_handle * dl_load_library(const std::string & path) { + std::wstring_convert> converter; + return dl_load_library(converter.from_bytes(path)); +} + +static void * dl_get_sym(dl_handle * handle, const char * name) { + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, name); + + SetErrorMode(old_mode); + + return p; +} + +#else + +using dl_handle = void; + +struct dl_handle_deleter { + void operator()(void * handle) { + dlclose(handle); + } +}; + +static void * dl_load_library(const std::string & path) { + dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); + + return handle; +} + +static void * dl_get_sym(dl_handle * handle, const char * name) { + return dlsym(handle, name); +} + +#endif + +using dl_handle_ptr = std::unique_ptr; + struct ggml_backend_reg_entry { ggml_backend_reg_t reg; - void * handle; + dl_handle_ptr handle; }; struct ggml_backend_registry { @@ -97,13 +164,16 @@ struct ggml_backend_registry { } ~ggml_backend_registry() { - while (!backends.empty()) { - // use silent since the log system may have been destroyed at this point - unload_backend(backends.back().reg, true); + // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources, + // since backend threads may still be running and accessing resources from the dynamic library + for (auto & entry : backends) { + if (entry.handle) { + entry.handle.release(); // NOLINT + } } } - void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { + void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) { if (!reg) { return; } @@ -112,7 +182,7 @@ struct ggml_backend_registry { GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif - backends.push_back({ reg, handle }); + backends.push_back({ reg, std::move(handle) }); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); } @@ -126,79 +196,53 @@ struct ggml_backend_registry { } ggml_backend_reg_t load_backend(const char * path, bool silent) { -#ifdef _WIN32 - // suppress error dialogs for missing DLLs - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - HMODULE handle = LoadLibraryA(path); - + dl_handle_ptr handle { dl_load_library(path) }; if (!handle) { if (!silent) { - GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); + GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path); } - SetErrorMode(old_mode); return nullptr; } - ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); - - SetErrorMode(old_mode); - - if (!backend_init) { + auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); + if (score_fn && score_fn() == 0) { if (!silent) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); + GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path); } - FreeLibrary(handle); return nullptr; } -#else - void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); - if (!handle) { + auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); + if (!backend_init_fn) { if (!silent) { - GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path); } return nullptr; } - auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); - - if (!backend_init) { - if (!silent) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); - } - dlclose(handle); - return nullptr; - } -#endif - ggml_backend_reg_t reg = backend_init(); - + ggml_backend_reg_t reg = backend_init_fn(); if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!silent) { if (!reg) { GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); } else { GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", - __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); + __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); } } -#ifdef _WIN32 - FreeLibrary(handle); -#else - dlclose(handle); -#endif return nullptr; } GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); - register_backend(reg, handle); + + register_backend(reg, std::move(handle)); + return reg; } void unload_backend(ggml_backend_reg_t reg, bool silent) { auto it = std::find_if(backends.begin(), backends.end(), - [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); + [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; }); if (it == backends.end()) { if (!silent) { @@ -217,15 +261,6 @@ struct ggml_backend_registry { [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), devices.end()); - // unload library - if (it->handle) { -#ifdef _WIN32 - FreeLibrary((HMODULE) it->handle); -#else - dlclose(it->handle); -#endif - } - // remove backend backends.erase(it); } @@ -341,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) { get_reg().unload_backend(reg, true); } -void ggml_backend_load_all() { - std::vector search_prefix; - - // add the executable directory to the search path - // FIXME: this is convenient for development, but it should probably be disabled in production - +static std::string get_executable_path() { #if defined(__APPLE__) // get executable path std::vector path; @@ -364,7 +394,7 @@ void ggml_backend_load_all() { if (last_slash != std::string::npos) { base_path = base_path.substr(0, last_slash); } - search_prefix.push_back(base_path + "/"); + return base_path + "/"; #elif defined(__linux__) std::string base_path = "."; std::vector path(1024); @@ -386,38 +416,104 @@ void ggml_backend_load_all() { path.resize(path.size() * 2); } - search_prefix.push_back(base_path + "/"); + return base_path + "/"; +#elif defined(_WIN32) + std::vector path(MAX_PATH); + DWORD len = GetModuleFileNameA(NULL, path.data(), path.size()); + if (len == 0) { + return ""; + } + std::string base_path(path.data(), len); + // remove executable name + auto last_slash = base_path.find_last_of('\\'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + return base_path + "\\"; #endif +} - auto & reg = get_reg(); - - auto try_load = [&](const std::string & name) { - std::string os_name; +static std::string backend_filename_prefix() { #ifdef _WIN32 - os_name = "ggml-" + name + ".dll"; + return "ggml-"; #else - os_name = "libggml-" + name + ".so"; + return "libggml-"; #endif - if (reg.load_backend(os_name.c_str(), true)) { - return; +} + +static std::string backend_filename_suffix() { +#ifdef _WIN32 + return ".dll"; +#else + return ".so"; +#endif +} + +static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { + // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths + // TODO: search system paths + std::vector search_paths = { "./", get_executable_path() }; + std::string file_prefix = backend_filename_prefix() + name + "-"; + + int best_score = 0; + std::string best_path; + + namespace fs = std::filesystem; + for (const auto & search_path : search_paths) { + if (!fs::exists(search_path)) { + continue; } - for (const auto & prefix : search_prefix) { - if (reg.load_backend((prefix + os_name).c_str(), true)) { - return; + for (const auto & entry : fs::directory_iterator(search_path)) { + if (entry.is_regular_file()) { + std::string filename = entry.path().filename().string(); + std::string ext = entry.path().extension().string(); + if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { + dl_handle_ptr handle { dl_load_library(entry.path().c_str()) }; + if (!handle && !silent) { + GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str()); + } + if (handle) { + auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); + if (score_fn) { + int s = score_fn(); +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s); +#endif + if (s > best_score) { + best_score = s; + best_path = entry.path().string(); + } + } + } + } } } - }; + } - try_load("amx"); - try_load("blas"); - try_load("cann"); - try_load("cuda"); - try_load("hip"); - try_load("kompute"); - try_load("metal"); - try_load("rpc"); - try_load("sycl"); - try_load("vulkan"); - try_load("musa"); - try_load("cpu"); + if (best_score == 0) { + // try to load the base backend + for (const auto & search_path : search_paths) { + std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix(); + if (fs::exists(path)) { + return get_reg().load_backend(path.c_str(), silent); + } + } + return nullptr; + } + + return get_reg().load_backend(best_path.c_str(), silent); +} + +void ggml_backend_load_all() { + ggml_backend_load_best("blas", true); + ggml_backend_load_best("cann", true); + ggml_backend_load_best("cuda", true); + ggml_backend_load_best("hip", true); + ggml_backend_load_best("kompute", true); + ggml_backend_load_best("metal", true); + ggml_backend_load_best("rpc", true); + ggml_backend_load_best("sycl", true); + ggml_backend_load_best("vulkan", true); + ggml_backend_load_best("musa", true); + ggml_backend_load_best("cpu", true); } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index fe2222084..5df63884c 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -217,6 +217,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW elseif (GGML_AVX) list(APPEND ARCH_FLAGS /arch:AVX) endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_DEFINITIONS __AVXVNNI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavxvnni) + endif() + endif() else() if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) @@ -233,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW if (GGML_AVX2) list(APPEND ARCH_FLAGS -mavx2) endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + endif() if (GGML_AVX512) list(APPEND ARCH_FLAGS -mavx512f) list(APPEND ARCH_FLAGS -mavx512dq) @@ -301,6 +310,10 @@ target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") +# the feature detection code must be compiled without any architecture flags +target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) +# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection + if (EMSCRIPTEN) set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") endif() diff --git a/ggml/src/ggml-cpu/amx/common.h b/ggml/src/ggml-cpu/amx/common.h index 0b0657289..40074c3fc 100644 --- a/ggml/src/ggml-cpu/amx/common.h +++ b/ggml/src/ggml-cpu/amx/common.h @@ -78,7 +78,6 @@ inline void parallel_for_ggml(const ggml_compute_params * params, int n, const f int tbegin, tend; balance211(n, params->nth, params->ith, tbegin, tend); f(tbegin, tend); - ggml_barrier(params->threadpool); // TODO: might not always be needed } // quantized types that have AMX support diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 6447e73d0..0ec3aa86d 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -1340,21 +1340,19 @@ struct tinygemm_kernel_avx __m512 vb[COLS]; __m512 vc[ROWS * COLS]; - auto loadc = [&](int idx) { + auto loadc = [&](auto idx) { vc[idx] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int idx, int k) { - // TODO: use `constexpr` here to get rid of interger div - // when upgraded to C++17 - const int row = idx / COLS; - const int col = idx % COLS; + auto compute = [&](auto idx, auto k) { + constexpr int row = idx / COLS; + constexpr int col = idx % COLS; - if (col == 0) { + if constexpr (col == 0) { va = _mm512_loadu_ps(A + row * K + k); } - if (row == 0) { + if constexpr (row == 0) { vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k))); } vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); @@ -1364,9 +1362,9 @@ struct tinygemm_kernel_avx Unroll{}(compute, k); } - auto storec = [&](int idx) { - const int row = idx / COLS; - const int col = idx % COLS; + auto storec = [&](auto idx) { + constexpr int row = idx / COLS; + constexpr int col = idx % COLS; C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]); }; Unroll{}(storec); @@ -1429,14 +1427,14 @@ struct tinygemm_kernel_vnni{}(loadc); - auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a and compute compensation - if (col == 0) { + if constexpr (col == 0) { const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); vcomp = _mm512_setzero_si512(); for (int k = 0; k < 8; ++k) { @@ -1468,7 +1466,7 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1492,14 +1490,14 @@ struct tinygemm_kernel_vnni const __m512i lowMask = _mm512_set1_epi8(0xF); - auto loadc = [&](int col) { + auto loadc = [&](auto col) { vc[col] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a - if (col == 0) { + if constexpr (col == 0) { const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); @@ -1533,7 +1531,7 @@ struct tinygemm_kernel_vnni } //store to C - auto storec = [&](int col) { + auto storec = [&](auto col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); }; Unroll{}(storec); @@ -1564,14 +1562,14 @@ struct tinygemm_kernel_vnni(0x80)); - auto loadc = [&](int col) { + auto loadc = [&](auto col) { vc[col] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a and add offset 128 - if (col == 0) { + if constexpr (col == 0) { const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); @@ -1604,7 +1602,7 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1636,7 +1634,7 @@ struct tinygemm_kernel_vnni{}(loadc); @@ -1650,9 +1648,9 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1737,15 +1735,15 @@ struct tinygemm_kernel_vnni{}(loadc); // Q5_K and Q4_K shares the same vnni formats, refer to notes above. - auto compute = [&](int col, int i) { + auto compute = [&](auto col, auto i) { // load a - if (col == 0) { + if constexpr (col == 0) { for (int k_group = 0; k_group < QK_K / 32; ++k_group) { va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); } @@ -1810,7 +1808,7 @@ struct tinygemm_kernel_vnni{}(storec); @@ -1843,13 +1841,13 @@ struct tinygemm_kernel_vnni{}(loadc); - auto compute = [&](int col, int i) { - if (col == 0) { + auto compute = [&](auto col, auto i) { + if constexpr (col == 0) { // load a va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); @@ -1961,13 +1959,13 @@ struct tinygemm_kernel_vnni(0x80)); const __m512i values256 = _mm512_add_epi8(values128, off); - auto loadc = [&](int col) { + auto loadc = [&](auto col) { vc[col] = _mm512_setzero_ps(); }; Unroll{}(loadc); - auto compute = [&](int col, int i) { - if (col == 0) { + auto compute = [&](auto col, auto i) { + if constexpr (col == 0) { // load a va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); @@ -2017,7 +2015,7 @@ struct tinygemm_kernel_vnni{}(storec); diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp new file mode 100644 index 000000000..514701ffe --- /dev/null +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -0,0 +1,298 @@ +#include "ggml-cpu.h" +#include "ggml-backend-impl.h" + +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include +#include + +struct cpuid_x86 { + bool SSE3(void) { return f_1_ecx[0]; } + bool PCLMULQDQ(void) { return f_1_ecx[1]; } + bool MONITOR(void) { return f_1_ecx[3]; } + bool SSSE3(void) { return f_1_ecx[9]; } + bool FMA(void) { return f_1_ecx[12]; } + bool CMPXCHG16B(void) { return f_1_ecx[13]; } + bool SSE41(void) { return f_1_ecx[19]; } + bool SSE42(void) { return f_1_ecx[20]; } + bool MOVBE(void) { return f_1_ecx[22]; } + bool POPCNT(void) { return f_1_ecx[23]; } + bool AES(void) { return f_1_ecx[25]; } + bool XSAVE(void) { return f_1_ecx[26]; } + bool OSXSAVE(void) { return f_1_ecx[27]; } + bool AVX(void) { return f_1_ecx[28]; } + bool F16C(void) { return f_1_ecx[29]; } + bool RDRAND(void) { return f_1_ecx[30]; } + + bool MSR(void) { return f_1_edx[5]; } + bool CX8(void) { return f_1_edx[8]; } + bool SEP(void) { return f_1_edx[11]; } + bool CMOV(void) { return f_1_edx[15]; } + bool CLFSH(void) { return f_1_edx[19]; } + bool MMX(void) { return f_1_edx[23]; } + bool FXSR(void) { return f_1_edx[24]; } + bool SSE(void) { return f_1_edx[25]; } + bool SSE2(void) { return f_1_edx[26]; } + + bool FSGSBASE(void) { return f_7_ebx[0]; } + bool BMI1(void) { return f_7_ebx[3]; } + bool HLE(void) { return is_intel && f_7_ebx[4]; } + bool AVX2(void) { return f_7_ebx[5]; } + bool BMI2(void) { return f_7_ebx[8]; } + bool ERMS(void) { return f_7_ebx[9]; } + bool INVPCID(void) { return f_7_ebx[10]; } + bool RTM(void) { return is_intel && f_7_ebx[11]; } + bool AVX512F(void) { return f_7_ebx[16]; } + bool RDSEED(void) { return f_7_ebx[18]; } + bool ADX(void) { return f_7_ebx[19]; } + bool AVX512PF(void) { return f_7_ebx[26]; } + bool AVX512ER(void) { return f_7_ebx[27]; } + bool AVX512CD(void) { return f_7_ebx[28]; } + bool SHA(void) { return f_7_ebx[29]; } + + bool PREFETCHWT1(void) { return f_7_ecx[0]; } + + bool LAHF(void) { return f_81_ecx[0]; } + bool LZCNT(void) { return is_intel && f_81_ecx[5]; } + bool ABM(void) { return is_amd && f_81_ecx[5]; } + bool SSE4a(void) { return is_amd && f_81_ecx[6]; } + bool XOP(void) { return is_amd && f_81_ecx[11]; } + bool TBM(void) { return is_amd && f_81_ecx[21]; } + + bool SYSCALL(void) { return is_intel && f_81_edx[11]; } + bool MMXEXT(void) { return is_amd && f_81_edx[22]; } + bool RDTSCP(void) { return is_intel && f_81_edx[27]; } + bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; } + bool _3DNOW(void) { return is_amd && f_81_edx[31]; } + + bool AVX512_VBMI(void) { return f_7_ecx[1]; } + bool AVX512_VNNI(void) { return f_7_ecx[11]; } + bool AVX512_FP16(void) { return f_7_edx[23]; } + bool AVX512_BF16(void) { return f_7_1_eax[5]; } + bool AVX_VNNI(void) { return f_7_1_eax[4]; } + + bool AMX_TILE(void) { return f_7_edx[24]; } + bool AMX_INT8(void) { return f_7_edx[25]; } + bool AMX_FP16(void) { return f_7_1_eax[21]; } + bool AMX_BF16(void) { return f_7_edx[22]; } + +#ifdef _MSC_VER + static void cpuid(int cpu_info[4], int eax) { + __cpuid(cpu_info, eax); + } + static void cpuidex(int cpu_info[4], int eax, int ecx) { + __cpuidex(cpu_info, eax, ecx); + } +#else + static void cpuid(int cpu_info[4], int eax) { + __asm__ __volatile__( + "cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(eax), "c"(0)); + } + static void cpuidex(int cpu_info[4], int eax, int ecx) { + __asm__ __volatile__( + "cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(eax), "c"(ecx)); + } +#endif + + cpuid_x86() { + std::array cpui; + std::vector> data; + + // calling __cpuid with 0x0 as the function_id argument + // gets the number of the highest valid function ID. + cpuid(cpui.data(), 0); + int n_ids = cpui[0]; + + for (int i = 0; i <= n_ids; ++i) { + cpuidex(cpui.data(), i, 0); + data.push_back(cpui); + } + + // capture vendor string + char vendor[0x20] = {}; + *reinterpret_cast(vendor) = data[0][1]; + *reinterpret_cast(vendor + 4) = data[0][3]; + *reinterpret_cast(vendor + 8) = data[0][2]; + this->vendor = vendor; + if (this->vendor == "GenuineIntel") { + is_intel = true; + } else if (this->vendor == "AuthenticAMD") { + is_amd = true; + } + + // load bitset with flags for function 0x00000001 + if (n_ids >= 1) { + f_1_ecx = data[1][2]; + f_1_edx = data[1][3]; + } + + // load bitset with flags for function 0x00000007 + if (n_ids >= 7) { + f_7_ebx = data[7][1]; + f_7_ecx = data[7][2]; + f_7_edx = data[7][3]; + cpuidex(cpui.data(), 7, 1); + f_7_1_eax = cpui[0]; + } + + // calling __cpuid with 0x80000000 as the function_id argument + // gets the number of the highest valid extended ID. + cpuid(cpui.data(), 0x80000000); + unsigned int n_ex_ids = cpui[0]; + + std::vector> ext_data; + for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) { + cpuidex(cpui.data(), i, 0); + ext_data.push_back(cpui); + } + + // load bitset with flags for function 0x80000001 + if (n_ex_ids >= 0x80000001) { + f_81_ecx = ext_data[1][2]; + f_81_edx = ext_data[1][3]; + } + + // interpret CPU brand string if reported + char brand[0x40] = {}; + if (n_ex_ids >= 0x80000004) { + std::memcpy(brand, ext_data[2].data(), sizeof(cpui)); + std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui)); + std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui)); + this->brand = brand; + } + } + + bool is_intel = false; + bool is_amd = false; + std::string vendor; + std::string brand; + std::bitset<32> f_1_ecx; + std::bitset<32> f_1_edx; + std::bitset<32> f_7_ebx; + std::bitset<32> f_7_ecx; + std::bitset<32> f_7_edx; + std::bitset<32> f_7_1_eax; + std::bitset<32> f_81_ecx; + std::bitset<32> f_81_edx; +}; + +#if 0 +void test_x86_is() { + cpuid_x86 is; + printf("CPU Vendor: %s\n", is.vendor.c_str()); + printf("Brand: %s\n", is.brand.c_str()); + printf("is_intel: %d\n", is.is_intel); + printf("is_amd: %d\n", is.is_amd); + printf("sse3: %d\n", is.SSE3()); + printf("pclmulqdq: %d\n", is.PCLMULQDQ()); + printf("ssse3: %d\n", is.SSSE3()); + printf("fma: %d\n", is.FMA()); + printf("cmpxchg16b: %d\n", is.CMPXCHG16B()); + printf("sse41: %d\n", is.SSE41()); + printf("sse42: %d\n", is.SSE42()); + printf("movbe: %d\n", is.MOVBE()); + printf("popcnt: %d\n", is.POPCNT()); + printf("aes: %d\n", is.AES()); + printf("xsave: %d\n", is.XSAVE()); + printf("osxsave: %d\n", is.OSXSAVE()); + printf("avx: %d\n", is.AVX()); + printf("f16c: %d\n", is.F16C()); + printf("rdrand: %d\n", is.RDRAND()); + printf("msr: %d\n", is.MSR()); + printf("cx8: %d\n", is.CX8()); + printf("sep: %d\n", is.SEP()); + printf("cmov: %d\n", is.CMOV()); + printf("clflush: %d\n", is.CLFSH()); + printf("mmx: %d\n", is.MMX()); + printf("fxsr: %d\n", is.FXSR()); + printf("sse: %d\n", is.SSE()); + printf("sse2: %d\n", is.SSE2()); + printf("fsgsbase: %d\n", is.FSGSBASE()); + printf("bmi1: %d\n", is.BMI1()); + printf("hle: %d\n", is.HLE()); + printf("avx2: %d\n", is.AVX2()); + printf("bmi2: %d\n", is.BMI2()); + printf("erms: %d\n", is.ERMS()); + printf("invpcid: %d\n", is.INVPCID()); + printf("rtm: %d\n", is.RTM()); + printf("avx512f: %d\n", is.AVX512F()); + printf("rdseed: %d\n", is.RDSEED()); + printf("adx: %d\n", is.ADX()); + printf("avx512pf: %d\n", is.AVX512PF()); + printf("avx512er: %d\n", is.AVX512ER()); + printf("avx512cd: %d\n", is.AVX512CD()); + printf("sha: %d\n", is.SHA()); + printf("prefetchwt1: %d\n", is.PREFETCHWT1()); + printf("lahf: %d\n", is.LAHF()); + printf("lzcnt: %d\n", is.LZCNT()); + printf("abm: %d\n", is.ABM()); + printf("sse4a: %d\n", is.SSE4a()); + printf("xop: %d\n", is.XOP()); + printf("tbm: %d\n", is.TBM()); + printf("syscall: %d\n", is.SYSCALL()); + printf("mmxext: %d\n", is.MMXEXT()); + printf("rdtscp: %d\n", is.RDTSCP()); + printf("3dnowext: %d\n", is._3DNOWEXT()); + printf("3dnow: %d\n", is._3DNOW()); + printf("avx512_vbmi: %d\n", is.AVX512_VBMI()); + printf("avx512_vnni: %d\n", is.AVX512_VNNI()); + printf("avx512_fp16: %d\n", is.AVX512_FP16()); + printf("avx512_bf16: %d\n", is.AVX512_BF16()); + printf("amx_tile: %d\n", is.AMX_TILE()); + printf("amx_int8: %d\n", is.AMX_INT8()); + printf("amx_fp16: %d\n", is.AMX_FP16()); + printf("amx_bf16: %d\n", is.AMX_BF16()); +} +#endif + +static int ggml_backend_cpu_x86_score() { + // FIXME: this does not check for OS support + + cpuid_x86 is; + // if the CPU backend was built with any features not supported by the current CPU, it cannot be used + if (ggml_cpu_has_fma() && !is.FMA()) { return 0; } + if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; } + if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; } + if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; } + if (ggml_cpu_has_avx() && !is.AVX()) { return 0; } + if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; } + if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; } + if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; } + if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; } + if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; } + if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; } + if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; } + + // calculate a backend score based on the supported features + // more important features have a higher weight + int score = 0; + score += ggml_cpu_has_fma () * 1; + score += ggml_cpu_has_f16c () * 1<<1; + score += ggml_cpu_has_ssse3 () * 1<<2; + score += ggml_cpu_has_sse3 () * 1<<3; + score += ggml_cpu_has_avx_vnni () * 1<<4; + score += ggml_cpu_has_avx () * 1<<5; + score += ggml_cpu_has_avx2 () * 1<<6; + score += ggml_cpu_has_avx512 () * 1<<7; + // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used + score += ggml_cpu_has_avx512_bf16() * 1<<9; + score += ggml_cpu_has_avx512_vnni() * 1<<10; + score += ggml_cpu_has_amx_int8 () * 1<<11; + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score) + +#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 61a92cfd9..11152385e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) { } static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) { -#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) +#if defined(__AVX512VNNI__) const __m512i zero = _mm512_setzero_si512(); return _mm512_dpbusd_epi32(zero, ax, sy); #else diff --git a/scripts/build-cpu.sh b/scripts/build-cpu.sh new file mode 100755 index 000000000..4b2ad816e --- /dev/null +++ b/scripts/build-cpu.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +name="$1" +args="${@:2}" + +echo "Building $name with args: $args" + +rm -fr build-cpu-$1 +cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args +cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc) +cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so +rm -fr build-cpu-$1