mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 13:27:21 +01:00
ggml : automatic selection of best CPU backend (#10606)
* ggml : automatic selection of best CPU backend * amx : minor opt * add GGML_AVX_VNNI to enable avx-vnni, fix checks
This commit is contained in:
parent
86dc11c5bc
commit
3420909dff
@ -3,22 +3,34 @@ ARG UBUNTU_VERSION=22.04
|
|||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential git libcurl4-openssl-dev
|
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
ENV LLAMA_CURL=1
|
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-server
|
RUN \
|
||||||
|
# Build multiple versions of the CPU backend
|
||||||
|
scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \
|
||||||
|
scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \
|
||||||
|
scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
|
||||||
|
scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
|
||||||
|
# Build llama-server
|
||||||
|
cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||||
|
cmake --build build --target llama-server -j $(nproc) && \
|
||||||
|
# Copy the built libraries to /app/lib
|
||||||
|
mkdir -p /app/lib && \
|
||||||
|
mv libggml-cpu* /app/lib/ && \
|
||||||
|
find build -name "*.so" -exec cp {} /app/lib/ \;
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||||
|
|
||||||
COPY --from=build /app/llama-server /llama-server
|
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||||
|
COPY --from=build /app/lib/ /
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||||
|
@ -96,10 +96,6 @@ if (NOT DEFINED GGML_LLAMAFILE)
|
|||||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
set(GGML_LLAMAFILE_DEFAULT ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED GGML_AMX)
|
|
||||||
set(GGML_AMX ON)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (NOT DEFINED GGML_CUDA_GRAPHS)
|
if (NOT DEFINED GGML_CUDA_GRAPHS)
|
||||||
set(GGML_CUDA_GRAPHS_DEFAULT ON)
|
set(GGML_CUDA_GRAPHS_DEFAULT ON)
|
||||||
endif()
|
endif()
|
||||||
|
@ -88,5 +88,5 @@ let package = Package(
|
|||||||
linkerSettings: linkerSettings
|
linkerSettings: linkerSettings
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
cxxLanguageStandard: .cxx11
|
cxxLanguageStandard: .cxx17
|
||||||
)
|
)
|
||||||
|
@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
|||||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||||
|
|
||||||
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||||
|
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
|
||||||
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||||
option(GGML_AVX512 "ggml: enable AVX512" OFF)
|
option(GGML_AVX512 "ggml: enable AVX512" OFF)
|
||||||
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
||||||
|
@ -211,27 +211,45 @@ extern "C" {
|
|||||||
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
|
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
|
||||||
|
|
||||||
// Add backend dynamic loading support to the backend
|
// Add backend dynamic loading support to the backend
|
||||||
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
|
|
||||||
|
|
||||||
#ifdef GGML_BACKEND_DL
|
// Initialize the backend
|
||||||
#ifdef __cplusplus
|
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
|
||||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
// Optional: obtain a score for the backend based on the system configuration
|
||||||
extern "C" { \
|
// Higher scores are preferred, 0 means the backend is not supported in the current system
|
||||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
typedef int (*ggml_backend_score_t)(void);
|
||||||
} \
|
|
||||||
ggml_backend_reg_t ggml_backend_init(void) { \
|
#ifdef GGML_BACKEND_DL
|
||||||
return reg_fn(); \
|
# ifdef __cplusplus
|
||||||
}
|
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||||
#else
|
extern "C" { \
|
||||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
||||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
} \
|
||||||
ggml_backend_reg_t ggml_backend_init(void) { \
|
ggml_backend_reg_t ggml_backend_init(void) { \
|
||||||
return reg_fn(); \
|
return reg_fn(); \
|
||||||
}
|
}
|
||||||
#endif
|
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
|
||||||
#else
|
extern "C" { \
|
||||||
# define GGML_BACKEND_DL_IMPL(reg_fn)
|
GGML_BACKEND_API int ggml_backend_score(void); \
|
||||||
#endif
|
} \
|
||||||
|
int ggml_backend_score(void) { \
|
||||||
|
return score_fn(); \
|
||||||
|
}
|
||||||
|
# else
|
||||||
|
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||||
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
||||||
|
ggml_backend_reg_t ggml_backend_init(void) { \
|
||||||
|
return reg_fn(); \
|
||||||
|
}
|
||||||
|
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
|
||||||
|
GGML_BACKEND_API int ggml_backend_score(void); \
|
||||||
|
int ggml_backend_score(void) { \
|
||||||
|
return score_fn(); \
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
#else
|
||||||
|
# define GGML_BACKEND_DL_IMPL(reg_fn)
|
||||||
|
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,13 @@
|
|||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
#include "ggml-impl.h"
|
#include "ggml-impl.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <codecvt>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <locale>
|
||||||
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <type_traits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@ -57,9 +62,71 @@
|
|||||||
#include "ggml-kompute.h"
|
#include "ggml-kompute.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
|
||||||
|
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||||
|
|
||||||
|
struct dl_handle_deleter {
|
||||||
|
void operator()(HMODULE handle) {
|
||||||
|
FreeLibrary(handle);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static dl_handle * dl_load_library(const std::wstring & path) {
|
||||||
|
// suppress error dialogs for missing DLLs
|
||||||
|
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||||
|
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||||
|
|
||||||
|
HMODULE handle = LoadLibraryW(path.c_str());
|
||||||
|
|
||||||
|
SetErrorMode(old_mode);
|
||||||
|
|
||||||
|
return handle;
|
||||||
|
}
|
||||||
|
|
||||||
|
static dl_handle * dl_load_library(const std::string & path) {
|
||||||
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||||
|
return dl_load_library(converter.from_bytes(path));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||||
|
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||||
|
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||||
|
|
||||||
|
void * p = (void *) GetProcAddress(handle, name);
|
||||||
|
|
||||||
|
SetErrorMode(old_mode);
|
||||||
|
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
using dl_handle = void;
|
||||||
|
|
||||||
|
struct dl_handle_deleter {
|
||||||
|
void operator()(void * handle) {
|
||||||
|
dlclose(handle);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void * dl_load_library(const std::string & path) {
|
||||||
|
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||||
|
|
||||||
|
return handle;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||||
|
return dlsym(handle, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
|
||||||
|
|
||||||
struct ggml_backend_reg_entry {
|
struct ggml_backend_reg_entry {
|
||||||
ggml_backend_reg_t reg;
|
ggml_backend_reg_t reg;
|
||||||
void * handle;
|
dl_handle_ptr handle;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_registry {
|
struct ggml_backend_registry {
|
||||||
@ -97,13 +164,16 @@ struct ggml_backend_registry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
~ggml_backend_registry() {
|
~ggml_backend_registry() {
|
||||||
while (!backends.empty()) {
|
// FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
|
||||||
// use silent since the log system may have been destroyed at this point
|
// since backend threads may still be running and accessing resources from the dynamic library
|
||||||
unload_backend(backends.back().reg, true);
|
for (auto & entry : backends) {
|
||||||
|
if (entry.handle) {
|
||||||
|
entry.handle.release(); // NOLINT
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
|
void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
|
||||||
if (!reg) {
|
if (!reg) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -112,7 +182,7 @@ struct ggml_backend_registry {
|
|||||||
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||||
#endif
|
#endif
|
||||||
backends.push_back({ reg, handle });
|
backends.push_back({ reg, std::move(handle) });
|
||||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||||
register_device(ggml_backend_reg_dev_get(reg, i));
|
register_device(ggml_backend_reg_dev_get(reg, i));
|
||||||
}
|
}
|
||||||
@ -126,79 +196,53 @@ struct ggml_backend_registry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_reg_t load_backend(const char * path, bool silent) {
|
ggml_backend_reg_t load_backend(const char * path, bool silent) {
|
||||||
#ifdef _WIN32
|
dl_handle_ptr handle { dl_load_library(path) };
|
||||||
// suppress error dialogs for missing DLLs
|
|
||||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
|
||||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
|
||||||
|
|
||||||
HMODULE handle = LoadLibraryA(path);
|
|
||||||
|
|
||||||
if (!handle) {
|
if (!handle) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
|
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
|
||||||
}
|
}
|
||||||
SetErrorMode(old_mode);
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
|
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||||
|
if (score_fn && score_fn() == 0) {
|
||||||
SetErrorMode(old_mode);
|
|
||||||
|
|
||||||
if (!backend_init) {
|
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
|
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
|
||||||
}
|
}
|
||||||
FreeLibrary(handle);
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
|
|
||||||
|
|
||||||
if (!handle) {
|
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||||
|
if (!backend_init_fn) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
|
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
|
ggml_backend_reg_t reg = backend_init_fn();
|
||||||
|
|
||||||
if (!backend_init) {
|
|
||||||
if (!silent) {
|
|
||||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
|
|
||||||
}
|
|
||||||
dlclose(handle);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
ggml_backend_reg_t reg = backend_init();
|
|
||||||
|
|
||||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
if (!reg) {
|
if (!reg) {
|
||||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
|
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
|
||||||
} else {
|
} else {
|
||||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||||
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
|
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef _WIN32
|
|
||||||
FreeLibrary(handle);
|
|
||||||
#else
|
|
||||||
dlclose(handle);
|
|
||||||
#endif
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
||||||
register_backend(reg, handle);
|
|
||||||
|
register_backend(reg, std::move(handle));
|
||||||
|
|
||||||
return reg;
|
return reg;
|
||||||
}
|
}
|
||||||
|
|
||||||
void unload_backend(ggml_backend_reg_t reg, bool silent) {
|
void unload_backend(ggml_backend_reg_t reg, bool silent) {
|
||||||
auto it = std::find_if(backends.begin(), backends.end(),
|
auto it = std::find_if(backends.begin(), backends.end(),
|
||||||
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
|
[reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
|
||||||
|
|
||||||
if (it == backends.end()) {
|
if (it == backends.end()) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
@ -217,15 +261,6 @@ struct ggml_backend_registry {
|
|||||||
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
|
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
|
||||||
devices.end());
|
devices.end());
|
||||||
|
|
||||||
// unload library
|
|
||||||
if (it->handle) {
|
|
||||||
#ifdef _WIN32
|
|
||||||
FreeLibrary((HMODULE) it->handle);
|
|
||||||
#else
|
|
||||||
dlclose(it->handle);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove backend
|
// remove backend
|
||||||
backends.erase(it);
|
backends.erase(it);
|
||||||
}
|
}
|
||||||
@ -341,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
|
|||||||
get_reg().unload_backend(reg, true);
|
get_reg().unload_backend(reg, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_load_all() {
|
static std::string get_executable_path() {
|
||||||
std::vector<std::string> search_prefix;
|
|
||||||
|
|
||||||
// add the executable directory to the search path
|
|
||||||
// FIXME: this is convenient for development, but it should probably be disabled in production
|
|
||||||
|
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
// get executable path
|
// get executable path
|
||||||
std::vector<char> path;
|
std::vector<char> path;
|
||||||
@ -364,7 +394,7 @@ void ggml_backend_load_all() {
|
|||||||
if (last_slash != std::string::npos) {
|
if (last_slash != std::string::npos) {
|
||||||
base_path = base_path.substr(0, last_slash);
|
base_path = base_path.substr(0, last_slash);
|
||||||
}
|
}
|
||||||
search_prefix.push_back(base_path + "/");
|
return base_path + "/";
|
||||||
#elif defined(__linux__)
|
#elif defined(__linux__)
|
||||||
std::string base_path = ".";
|
std::string base_path = ".";
|
||||||
std::vector<char> path(1024);
|
std::vector<char> path(1024);
|
||||||
@ -386,38 +416,104 @@ void ggml_backend_load_all() {
|
|||||||
path.resize(path.size() * 2);
|
path.resize(path.size() * 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
search_prefix.push_back(base_path + "/");
|
return base_path + "/";
|
||||||
|
#elif defined(_WIN32)
|
||||||
|
std::vector<char> path(MAX_PATH);
|
||||||
|
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
|
||||||
|
if (len == 0) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
std::string base_path(path.data(), len);
|
||||||
|
// remove executable name
|
||||||
|
auto last_slash = base_path.find_last_of('\\');
|
||||||
|
if (last_slash != std::string::npos) {
|
||||||
|
base_path = base_path.substr(0, last_slash);
|
||||||
|
}
|
||||||
|
return base_path + "\\";
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
auto & reg = get_reg();
|
static std::string backend_filename_prefix() {
|
||||||
|
|
||||||
auto try_load = [&](const std::string & name) {
|
|
||||||
std::string os_name;
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
os_name = "ggml-" + name + ".dll";
|
return "ggml-";
|
||||||
#else
|
#else
|
||||||
os_name = "libggml-" + name + ".so";
|
return "libggml-";
|
||||||
#endif
|
#endif
|
||||||
if (reg.load_backend(os_name.c_str(), true)) {
|
}
|
||||||
return;
|
|
||||||
|
static std::string backend_filename_suffix() {
|
||||||
|
#ifdef _WIN32
|
||||||
|
return ".dll";
|
||||||
|
#else
|
||||||
|
return ".so";
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
|
||||||
|
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||||
|
// TODO: search system paths
|
||||||
|
std::vector<std::string> search_paths = { "./", get_executable_path() };
|
||||||
|
std::string file_prefix = backend_filename_prefix() + name + "-";
|
||||||
|
|
||||||
|
int best_score = 0;
|
||||||
|
std::string best_path;
|
||||||
|
|
||||||
|
namespace fs = std::filesystem;
|
||||||
|
for (const auto & search_path : search_paths) {
|
||||||
|
if (!fs::exists(search_path)) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
for (const auto & prefix : search_prefix) {
|
for (const auto & entry : fs::directory_iterator(search_path)) {
|
||||||
if (reg.load_backend((prefix + os_name).c_str(), true)) {
|
if (entry.is_regular_file()) {
|
||||||
return;
|
std::string filename = entry.path().filename().string();
|
||||||
|
std::string ext = entry.path().extension().string();
|
||||||
|
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||||
|
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
|
||||||
|
if (!handle && !silent) {
|
||||||
|
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
|
||||||
|
}
|
||||||
|
if (handle) {
|
||||||
|
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||||
|
if (score_fn) {
|
||||||
|
int s = score_fn();
|
||||||
|
#ifndef NDEBUG
|
||||||
|
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
|
||||||
|
#endif
|
||||||
|
if (s > best_score) {
|
||||||
|
best_score = s;
|
||||||
|
best_path = entry.path().string();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
try_load("amx");
|
if (best_score == 0) {
|
||||||
try_load("blas");
|
// try to load the base backend
|
||||||
try_load("cann");
|
for (const auto & search_path : search_paths) {
|
||||||
try_load("cuda");
|
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
|
||||||
try_load("hip");
|
if (fs::exists(path)) {
|
||||||
try_load("kompute");
|
return get_reg().load_backend(path.c_str(), silent);
|
||||||
try_load("metal");
|
}
|
||||||
try_load("rpc");
|
}
|
||||||
try_load("sycl");
|
return nullptr;
|
||||||
try_load("vulkan");
|
}
|
||||||
try_load("musa");
|
|
||||||
try_load("cpu");
|
return get_reg().load_backend(best_path.c_str(), silent);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_backend_load_all() {
|
||||||
|
ggml_backend_load_best("blas", true);
|
||||||
|
ggml_backend_load_best("cann", true);
|
||||||
|
ggml_backend_load_best("cuda", true);
|
||||||
|
ggml_backend_load_best("hip", true);
|
||||||
|
ggml_backend_load_best("kompute", true);
|
||||||
|
ggml_backend_load_best("metal", true);
|
||||||
|
ggml_backend_load_best("rpc", true);
|
||||||
|
ggml_backend_load_best("sycl", true);
|
||||||
|
ggml_backend_load_best("vulkan", true);
|
||||||
|
ggml_backend_load_best("musa", true);
|
||||||
|
ggml_backend_load_best("cpu", true);
|
||||||
}
|
}
|
||||||
|
@ -217,6 +217,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
|
|||||||
elseif (GGML_AVX)
|
elseif (GGML_AVX)
|
||||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||||
endif()
|
endif()
|
||||||
|
if (GGML_AVX_VNNI)
|
||||||
|
list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
|
||||||
|
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||||
|
list(APPEND ARCH_FLAGS -mavxvnni)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
else()
|
else()
|
||||||
if (GGML_NATIVE)
|
if (GGML_NATIVE)
|
||||||
list(APPEND ARCH_FLAGS -march=native)
|
list(APPEND ARCH_FLAGS -march=native)
|
||||||
@ -233,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
|
|||||||
if (GGML_AVX2)
|
if (GGML_AVX2)
|
||||||
list(APPEND ARCH_FLAGS -mavx2)
|
list(APPEND ARCH_FLAGS -mavx2)
|
||||||
endif()
|
endif()
|
||||||
|
if (GGML_AVX_VNNI)
|
||||||
|
list(APPEND ARCH_FLAGS -mavxvnni)
|
||||||
|
endif()
|
||||||
if (GGML_AVX512)
|
if (GGML_AVX512)
|
||||||
list(APPEND ARCH_FLAGS -mavx512f)
|
list(APPEND ARCH_FLAGS -mavx512f)
|
||||||
list(APPEND ARCH_FLAGS -mavx512dq)
|
list(APPEND ARCH_FLAGS -mavx512dq)
|
||||||
@ -301,6 +310,10 @@ target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
|
|||||||
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
|
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
|
||||||
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
|
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
|
||||||
|
|
||||||
|
# the feature detection code must be compiled without any architecture flags
|
||||||
|
target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
|
||||||
|
# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
|
||||||
|
|
||||||
if (EMSCRIPTEN)
|
if (EMSCRIPTEN)
|
||||||
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
|
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||||
endif()
|
endif()
|
||||||
|
@ -78,7 +78,6 @@ inline void parallel_for_ggml(const ggml_compute_params * params, int n, const f
|
|||||||
int tbegin, tend;
|
int tbegin, tend;
|
||||||
balance211(n, params->nth, params->ith, tbegin, tend);
|
balance211(n, params->nth, params->ith, tbegin, tend);
|
||||||
f(tbegin, tend);
|
f(tbegin, tend);
|
||||||
ggml_barrier(params->threadpool); // TODO: might not always be needed
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// quantized types that have AMX support
|
// quantized types that have AMX support
|
||||||
|
@ -1340,21 +1340,19 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
|
|||||||
__m512 vb[COLS];
|
__m512 vb[COLS];
|
||||||
__m512 vc[ROWS * COLS];
|
__m512 vc[ROWS * COLS];
|
||||||
|
|
||||||
auto loadc = [&](int idx) {
|
auto loadc = [&](auto idx) {
|
||||||
vc[idx] = _mm512_setzero_ps();
|
vc[idx] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<ROWS * COLS>{}(loadc);
|
Unroll<ROWS * COLS>{}(loadc);
|
||||||
|
|
||||||
auto compute = [&](int idx, int k) {
|
auto compute = [&](auto idx, auto k) {
|
||||||
// TODO: use `constexpr` here to get rid of interger div
|
constexpr int row = idx / COLS;
|
||||||
// when upgraded to C++17
|
constexpr int col = idx % COLS;
|
||||||
const int row = idx / COLS;
|
|
||||||
const int col = idx % COLS;
|
|
||||||
|
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
va = _mm512_loadu_ps(A + row * K + k);
|
va = _mm512_loadu_ps(A + row * K + k);
|
||||||
}
|
}
|
||||||
if (row == 0) {
|
if constexpr (row == 0) {
|
||||||
vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
|
vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
|
||||||
}
|
}
|
||||||
vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
|
vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
|
||||||
@ -1364,9 +1362,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
|
|||||||
Unroll<ROWS * COLS>{}(compute, k);
|
Unroll<ROWS * COLS>{}(compute, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto storec = [&](int idx) {
|
auto storec = [&](auto idx) {
|
||||||
const int row = idx / COLS;
|
constexpr int row = idx / COLS;
|
||||||
const int col = idx % COLS;
|
constexpr int col = idx % COLS;
|
||||||
C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
|
C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
|
||||||
};
|
};
|
||||||
Unroll<ROWS * COLS>{}(storec);
|
Unroll<ROWS * COLS>{}(storec);
|
||||||
@ -1429,14 +1427,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
const __m512i off = _mm512_set1_epi8(8);
|
const __m512i off = _mm512_set1_epi8(8);
|
||||||
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
||||||
|
|
||||||
auto loadc = [&](int col) {
|
auto loadc = [&](auto col) {
|
||||||
vc[col] = _mm512_setzero_ps();
|
vc[col] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(loadc);
|
Unroll<COLS>{}(loadc);
|
||||||
|
|
||||||
auto compute = [&](int col, int i) {
|
auto compute = [&](auto col, auto i) {
|
||||||
// load a and compute compensation
|
// load a and compute compensation
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
|
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
|
||||||
vcomp = _mm512_setzero_si512();
|
vcomp = _mm512_setzero_si512();
|
||||||
for (int k = 0; k < 8; ++k) {
|
for (int k = 0; k < 8; ++k) {
|
||||||
@ -1468,7 +1466,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
}
|
}
|
||||||
|
|
||||||
//store to C
|
//store to C
|
||||||
auto storec = [&](int col) {
|
auto storec = [&](auto col) {
|
||||||
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(storec);
|
Unroll<COLS>{}(storec);
|
||||||
@ -1492,14 +1490,14 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
|
|||||||
|
|
||||||
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
||||||
|
|
||||||
auto loadc = [&](int col) {
|
auto loadc = [&](auto col) {
|
||||||
vc[col] = _mm512_setzero_ps();
|
vc[col] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(loadc);
|
Unroll<COLS>{}(loadc);
|
||||||
|
|
||||||
auto compute = [&](int col, int i) {
|
auto compute = [&](auto col, auto i) {
|
||||||
// load a
|
// load a
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
|
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
|
||||||
for (int k = 0; k < 8; ++k) {
|
for (int k = 0; k < 8; ++k) {
|
||||||
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
||||||
@ -1533,7 +1531,7 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
|
|||||||
}
|
}
|
||||||
|
|
||||||
//store to C
|
//store to C
|
||||||
auto storec = [&](int col) {
|
auto storec = [&](auto col) {
|
||||||
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(storec);
|
Unroll<COLS>{}(storec);
|
||||||
@ -1564,14 +1562,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
//
|
//
|
||||||
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
|
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
|
||||||
|
|
||||||
auto loadc = [&](int col) {
|
auto loadc = [&](auto col) {
|
||||||
vc[col] = _mm512_setzero_ps();
|
vc[col] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(loadc);
|
Unroll<COLS>{}(loadc);
|
||||||
|
|
||||||
auto compute = [&](int col, int i) {
|
auto compute = [&](auto col, auto i) {
|
||||||
// load a and add offset 128
|
// load a and add offset 128
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
|
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
|
||||||
for (int k = 0; k < 8; ++k) {
|
for (int k = 0; k < 8; ++k) {
|
||||||
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
||||||
@ -1604,7 +1602,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
}
|
}
|
||||||
|
|
||||||
//store to C
|
//store to C
|
||||||
auto storec = [&](int col) {
|
auto storec = [&](auto col) {
|
||||||
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(storec);
|
Unroll<COLS>{}(storec);
|
||||||
@ -1636,7 +1634,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
|
|
||||||
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
||||||
|
|
||||||
auto loadc = [&](int col) {
|
auto loadc = [&](auto col) {
|
||||||
vc[col] = _mm512_setzero_ps();
|
vc[col] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(loadc);
|
Unroll<COLS>{}(loadc);
|
||||||
@ -1650,9 +1648,9 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
// int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
|
// int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
|
||||||
// from {16, 8} to {4, 32}
|
// from {16, 8} to {4, 32}
|
||||||
//
|
//
|
||||||
auto compute = [&](int col, int i) {
|
auto compute = [&](auto col, auto i) {
|
||||||
// load a
|
// load a
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
|
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
|
||||||
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
|
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
|
||||||
}
|
}
|
||||||
@ -1704,7 +1702,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
}
|
}
|
||||||
|
|
||||||
//store to C
|
//store to C
|
||||||
auto storec = [&](int col) {
|
auto storec = [&](auto col) {
|
||||||
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(storec);
|
Unroll<COLS>{}(storec);
|
||||||
@ -1737,15 +1735,15 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
|
|
||||||
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
||||||
|
|
||||||
auto loadc = [&](int col) {
|
auto loadc = [&](auto col) {
|
||||||
vc[col] = _mm512_setzero_ps();
|
vc[col] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(loadc);
|
Unroll<COLS>{}(loadc);
|
||||||
|
|
||||||
// Q5_K and Q4_K shares the same vnni formats, refer to notes above.
|
// Q5_K and Q4_K shares the same vnni formats, refer to notes above.
|
||||||
auto compute = [&](int col, int i) {
|
auto compute = [&](auto col, auto i) {
|
||||||
// load a
|
// load a
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
|
for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
|
||||||
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
|
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
|
||||||
}
|
}
|
||||||
@ -1810,7 +1808,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
}
|
}
|
||||||
|
|
||||||
//store to C
|
//store to C
|
||||||
auto storec = [&](int col) {
|
auto storec = [&](auto col) {
|
||||||
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(storec);
|
Unroll<COLS>{}(storec);
|
||||||
@ -1843,13 +1841,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLO
|
|||||||
const __m512i m32s = _mm512_set1_epi32(32);
|
const __m512i m32s = _mm512_set1_epi32(32);
|
||||||
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
const __m512i lowMask = _mm512_set1_epi8(0xF);
|
||||||
|
|
||||||
auto loadc = [&](int col) {
|
auto loadc = [&](auto col) {
|
||||||
vc[col] = _mm512_setzero_ps();
|
vc[col] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(loadc);
|
Unroll<COLS>{}(loadc);
|
||||||
|
|
||||||
auto compute = [&](int col, int i) {
|
auto compute = [&](auto col, auto i) {
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
// load a
|
// load a
|
||||||
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
|
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
|
||||||
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
|
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
|
||||||
@ -1961,13 +1959,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
|
|||||||
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
|
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
|
||||||
const __m512i values256 = _mm512_add_epi8(values128, off);
|
const __m512i values256 = _mm512_add_epi8(values128, off);
|
||||||
|
|
||||||
auto loadc = [&](int col) {
|
auto loadc = [&](auto col) {
|
||||||
vc[col] = _mm512_setzero_ps();
|
vc[col] = _mm512_setzero_ps();
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(loadc);
|
Unroll<COLS>{}(loadc);
|
||||||
|
|
||||||
auto compute = [&](int col, int i) {
|
auto compute = [&](auto col, auto i) {
|
||||||
if (col == 0) {
|
if constexpr (col == 0) {
|
||||||
// load a
|
// load a
|
||||||
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
|
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
|
||||||
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
|
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
|
||||||
@ -2017,7 +2015,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
|
|||||||
}
|
}
|
||||||
|
|
||||||
//store to C
|
//store to C
|
||||||
auto storec = [&](int col) {
|
auto storec = [&](auto col) {
|
||||||
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
|
||||||
};
|
};
|
||||||
Unroll<COLS>{}(storec);
|
Unroll<COLS>{}(storec);
|
||||||
|
298
ggml/src/ggml-cpu/cpu-feats-x86.cpp
Normal file
298
ggml/src/ggml-cpu/cpu-feats-x86.cpp
Normal file
@ -0,0 +1,298 @@
|
|||||||
|
#include "ggml-cpu.h"
|
||||||
|
#include "ggml-backend-impl.h"
|
||||||
|
|
||||||
|
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#include <intrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
#include <bitset>
|
||||||
|
#include <array>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
struct cpuid_x86 {
|
||||||
|
bool SSE3(void) { return f_1_ecx[0]; }
|
||||||
|
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
|
||||||
|
bool MONITOR(void) { return f_1_ecx[3]; }
|
||||||
|
bool SSSE3(void) { return f_1_ecx[9]; }
|
||||||
|
bool FMA(void) { return f_1_ecx[12]; }
|
||||||
|
bool CMPXCHG16B(void) { return f_1_ecx[13]; }
|
||||||
|
bool SSE41(void) { return f_1_ecx[19]; }
|
||||||
|
bool SSE42(void) { return f_1_ecx[20]; }
|
||||||
|
bool MOVBE(void) { return f_1_ecx[22]; }
|
||||||
|
bool POPCNT(void) { return f_1_ecx[23]; }
|
||||||
|
bool AES(void) { return f_1_ecx[25]; }
|
||||||
|
bool XSAVE(void) { return f_1_ecx[26]; }
|
||||||
|
bool OSXSAVE(void) { return f_1_ecx[27]; }
|
||||||
|
bool AVX(void) { return f_1_ecx[28]; }
|
||||||
|
bool F16C(void) { return f_1_ecx[29]; }
|
||||||
|
bool RDRAND(void) { return f_1_ecx[30]; }
|
||||||
|
|
||||||
|
bool MSR(void) { return f_1_edx[5]; }
|
||||||
|
bool CX8(void) { return f_1_edx[8]; }
|
||||||
|
bool SEP(void) { return f_1_edx[11]; }
|
||||||
|
bool CMOV(void) { return f_1_edx[15]; }
|
||||||
|
bool CLFSH(void) { return f_1_edx[19]; }
|
||||||
|
bool MMX(void) { return f_1_edx[23]; }
|
||||||
|
bool FXSR(void) { return f_1_edx[24]; }
|
||||||
|
bool SSE(void) { return f_1_edx[25]; }
|
||||||
|
bool SSE2(void) { return f_1_edx[26]; }
|
||||||
|
|
||||||
|
bool FSGSBASE(void) { return f_7_ebx[0]; }
|
||||||
|
bool BMI1(void) { return f_7_ebx[3]; }
|
||||||
|
bool HLE(void) { return is_intel && f_7_ebx[4]; }
|
||||||
|
bool AVX2(void) { return f_7_ebx[5]; }
|
||||||
|
bool BMI2(void) { return f_7_ebx[8]; }
|
||||||
|
bool ERMS(void) { return f_7_ebx[9]; }
|
||||||
|
bool INVPCID(void) { return f_7_ebx[10]; }
|
||||||
|
bool RTM(void) { return is_intel && f_7_ebx[11]; }
|
||||||
|
bool AVX512F(void) { return f_7_ebx[16]; }
|
||||||
|
bool RDSEED(void) { return f_7_ebx[18]; }
|
||||||
|
bool ADX(void) { return f_7_ebx[19]; }
|
||||||
|
bool AVX512PF(void) { return f_7_ebx[26]; }
|
||||||
|
bool AVX512ER(void) { return f_7_ebx[27]; }
|
||||||
|
bool AVX512CD(void) { return f_7_ebx[28]; }
|
||||||
|
bool SHA(void) { return f_7_ebx[29]; }
|
||||||
|
|
||||||
|
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
|
||||||
|
|
||||||
|
bool LAHF(void) { return f_81_ecx[0]; }
|
||||||
|
bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
|
||||||
|
bool ABM(void) { return is_amd && f_81_ecx[5]; }
|
||||||
|
bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
|
||||||
|
bool XOP(void) { return is_amd && f_81_ecx[11]; }
|
||||||
|
bool TBM(void) { return is_amd && f_81_ecx[21]; }
|
||||||
|
|
||||||
|
bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
|
||||||
|
bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
|
||||||
|
bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
|
||||||
|
bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
|
||||||
|
bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
|
||||||
|
|
||||||
|
bool AVX512_VBMI(void) { return f_7_ecx[1]; }
|
||||||
|
bool AVX512_VNNI(void) { return f_7_ecx[11]; }
|
||||||
|
bool AVX512_FP16(void) { return f_7_edx[23]; }
|
||||||
|
bool AVX512_BF16(void) { return f_7_1_eax[5]; }
|
||||||
|
bool AVX_VNNI(void) { return f_7_1_eax[4]; }
|
||||||
|
|
||||||
|
bool AMX_TILE(void) { return f_7_edx[24]; }
|
||||||
|
bool AMX_INT8(void) { return f_7_edx[25]; }
|
||||||
|
bool AMX_FP16(void) { return f_7_1_eax[21]; }
|
||||||
|
bool AMX_BF16(void) { return f_7_edx[22]; }
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
static void cpuid(int cpu_info[4], int eax) {
|
||||||
|
__cpuid(cpu_info, eax);
|
||||||
|
}
|
||||||
|
static void cpuidex(int cpu_info[4], int eax, int ecx) {
|
||||||
|
__cpuidex(cpu_info, eax, ecx);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static void cpuid(int cpu_info[4], int eax) {
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"cpuid"
|
||||||
|
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||||
|
: "a"(eax), "c"(0));
|
||||||
|
}
|
||||||
|
static void cpuidex(int cpu_info[4], int eax, int ecx) {
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"cpuid"
|
||||||
|
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||||
|
: "a"(eax), "c"(ecx));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
cpuid_x86() {
|
||||||
|
std::array<int, 4> cpui;
|
||||||
|
std::vector<std::array<int, 4>> data;
|
||||||
|
|
||||||
|
// calling __cpuid with 0x0 as the function_id argument
|
||||||
|
// gets the number of the highest valid function ID.
|
||||||
|
cpuid(cpui.data(), 0);
|
||||||
|
int n_ids = cpui[0];
|
||||||
|
|
||||||
|
for (int i = 0; i <= n_ids; ++i) {
|
||||||
|
cpuidex(cpui.data(), i, 0);
|
||||||
|
data.push_back(cpui);
|
||||||
|
}
|
||||||
|
|
||||||
|
// capture vendor string
|
||||||
|
char vendor[0x20] = {};
|
||||||
|
*reinterpret_cast<int *>(vendor) = data[0][1];
|
||||||
|
*reinterpret_cast<int *>(vendor + 4) = data[0][3];
|
||||||
|
*reinterpret_cast<int *>(vendor + 8) = data[0][2];
|
||||||
|
this->vendor = vendor;
|
||||||
|
if (this->vendor == "GenuineIntel") {
|
||||||
|
is_intel = true;
|
||||||
|
} else if (this->vendor == "AuthenticAMD") {
|
||||||
|
is_amd = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load bitset with flags for function 0x00000001
|
||||||
|
if (n_ids >= 1) {
|
||||||
|
f_1_ecx = data[1][2];
|
||||||
|
f_1_edx = data[1][3];
|
||||||
|
}
|
||||||
|
|
||||||
|
// load bitset with flags for function 0x00000007
|
||||||
|
if (n_ids >= 7) {
|
||||||
|
f_7_ebx = data[7][1];
|
||||||
|
f_7_ecx = data[7][2];
|
||||||
|
f_7_edx = data[7][3];
|
||||||
|
cpuidex(cpui.data(), 7, 1);
|
||||||
|
f_7_1_eax = cpui[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
// calling __cpuid with 0x80000000 as the function_id argument
|
||||||
|
// gets the number of the highest valid extended ID.
|
||||||
|
cpuid(cpui.data(), 0x80000000);
|
||||||
|
unsigned int n_ex_ids = cpui[0];
|
||||||
|
|
||||||
|
std::vector<std::array<int, 4>> ext_data;
|
||||||
|
for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
|
||||||
|
cpuidex(cpui.data(), i, 0);
|
||||||
|
ext_data.push_back(cpui);
|
||||||
|
}
|
||||||
|
|
||||||
|
// load bitset with flags for function 0x80000001
|
||||||
|
if (n_ex_ids >= 0x80000001) {
|
||||||
|
f_81_ecx = ext_data[1][2];
|
||||||
|
f_81_edx = ext_data[1][3];
|
||||||
|
}
|
||||||
|
|
||||||
|
// interpret CPU brand string if reported
|
||||||
|
char brand[0x40] = {};
|
||||||
|
if (n_ex_ids >= 0x80000004) {
|
||||||
|
std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
|
||||||
|
std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
|
||||||
|
std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
|
||||||
|
this->brand = brand;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_intel = false;
|
||||||
|
bool is_amd = false;
|
||||||
|
std::string vendor;
|
||||||
|
std::string brand;
|
||||||
|
std::bitset<32> f_1_ecx;
|
||||||
|
std::bitset<32> f_1_edx;
|
||||||
|
std::bitset<32> f_7_ebx;
|
||||||
|
std::bitset<32> f_7_ecx;
|
||||||
|
std::bitset<32> f_7_edx;
|
||||||
|
std::bitset<32> f_7_1_eax;
|
||||||
|
std::bitset<32> f_81_ecx;
|
||||||
|
std::bitset<32> f_81_edx;
|
||||||
|
};
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
void test_x86_is() {
|
||||||
|
cpuid_x86 is;
|
||||||
|
printf("CPU Vendor: %s\n", is.vendor.c_str());
|
||||||
|
printf("Brand: %s\n", is.brand.c_str());
|
||||||
|
printf("is_intel: %d\n", is.is_intel);
|
||||||
|
printf("is_amd: %d\n", is.is_amd);
|
||||||
|
printf("sse3: %d\n", is.SSE3());
|
||||||
|
printf("pclmulqdq: %d\n", is.PCLMULQDQ());
|
||||||
|
printf("ssse3: %d\n", is.SSSE3());
|
||||||
|
printf("fma: %d\n", is.FMA());
|
||||||
|
printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
|
||||||
|
printf("sse41: %d\n", is.SSE41());
|
||||||
|
printf("sse42: %d\n", is.SSE42());
|
||||||
|
printf("movbe: %d\n", is.MOVBE());
|
||||||
|
printf("popcnt: %d\n", is.POPCNT());
|
||||||
|
printf("aes: %d\n", is.AES());
|
||||||
|
printf("xsave: %d\n", is.XSAVE());
|
||||||
|
printf("osxsave: %d\n", is.OSXSAVE());
|
||||||
|
printf("avx: %d\n", is.AVX());
|
||||||
|
printf("f16c: %d\n", is.F16C());
|
||||||
|
printf("rdrand: %d\n", is.RDRAND());
|
||||||
|
printf("msr: %d\n", is.MSR());
|
||||||
|
printf("cx8: %d\n", is.CX8());
|
||||||
|
printf("sep: %d\n", is.SEP());
|
||||||
|
printf("cmov: %d\n", is.CMOV());
|
||||||
|
printf("clflush: %d\n", is.CLFSH());
|
||||||
|
printf("mmx: %d\n", is.MMX());
|
||||||
|
printf("fxsr: %d\n", is.FXSR());
|
||||||
|
printf("sse: %d\n", is.SSE());
|
||||||
|
printf("sse2: %d\n", is.SSE2());
|
||||||
|
printf("fsgsbase: %d\n", is.FSGSBASE());
|
||||||
|
printf("bmi1: %d\n", is.BMI1());
|
||||||
|
printf("hle: %d\n", is.HLE());
|
||||||
|
printf("avx2: %d\n", is.AVX2());
|
||||||
|
printf("bmi2: %d\n", is.BMI2());
|
||||||
|
printf("erms: %d\n", is.ERMS());
|
||||||
|
printf("invpcid: %d\n", is.INVPCID());
|
||||||
|
printf("rtm: %d\n", is.RTM());
|
||||||
|
printf("avx512f: %d\n", is.AVX512F());
|
||||||
|
printf("rdseed: %d\n", is.RDSEED());
|
||||||
|
printf("adx: %d\n", is.ADX());
|
||||||
|
printf("avx512pf: %d\n", is.AVX512PF());
|
||||||
|
printf("avx512er: %d\n", is.AVX512ER());
|
||||||
|
printf("avx512cd: %d\n", is.AVX512CD());
|
||||||
|
printf("sha: %d\n", is.SHA());
|
||||||
|
printf("prefetchwt1: %d\n", is.PREFETCHWT1());
|
||||||
|
printf("lahf: %d\n", is.LAHF());
|
||||||
|
printf("lzcnt: %d\n", is.LZCNT());
|
||||||
|
printf("abm: %d\n", is.ABM());
|
||||||
|
printf("sse4a: %d\n", is.SSE4a());
|
||||||
|
printf("xop: %d\n", is.XOP());
|
||||||
|
printf("tbm: %d\n", is.TBM());
|
||||||
|
printf("syscall: %d\n", is.SYSCALL());
|
||||||
|
printf("mmxext: %d\n", is.MMXEXT());
|
||||||
|
printf("rdtscp: %d\n", is.RDTSCP());
|
||||||
|
printf("3dnowext: %d\n", is._3DNOWEXT());
|
||||||
|
printf("3dnow: %d\n", is._3DNOW());
|
||||||
|
printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
|
||||||
|
printf("avx512_vnni: %d\n", is.AVX512_VNNI());
|
||||||
|
printf("avx512_fp16: %d\n", is.AVX512_FP16());
|
||||||
|
printf("avx512_bf16: %d\n", is.AVX512_BF16());
|
||||||
|
printf("amx_tile: %d\n", is.AMX_TILE());
|
||||||
|
printf("amx_int8: %d\n", is.AMX_INT8());
|
||||||
|
printf("amx_fp16: %d\n", is.AMX_FP16());
|
||||||
|
printf("amx_bf16: %d\n", is.AMX_BF16());
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static int ggml_backend_cpu_x86_score() {
|
||||||
|
// FIXME: this does not check for OS support
|
||||||
|
|
||||||
|
cpuid_x86 is;
|
||||||
|
// if the CPU backend was built with any features not supported by the current CPU, it cannot be used
|
||||||
|
if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
|
||||||
|
if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
|
||||||
|
if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
|
||||||
|
if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
|
||||||
|
if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
|
||||||
|
if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
|
||||||
|
if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
|
||||||
|
if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
|
||||||
|
if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
|
||||||
|
if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
|
||||||
|
if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
|
||||||
|
if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
|
||||||
|
|
||||||
|
// calculate a backend score based on the supported features
|
||||||
|
// more important features have a higher weight
|
||||||
|
int score = 0;
|
||||||
|
score += ggml_cpu_has_fma () * 1;
|
||||||
|
score += ggml_cpu_has_f16c () * 1<<1;
|
||||||
|
score += ggml_cpu_has_ssse3 () * 1<<2;
|
||||||
|
score += ggml_cpu_has_sse3 () * 1<<3;
|
||||||
|
score += ggml_cpu_has_avx_vnni () * 1<<4;
|
||||||
|
score += ggml_cpu_has_avx () * 1<<5;
|
||||||
|
score += ggml_cpu_has_avx2 () * 1<<6;
|
||||||
|
score += ggml_cpu_has_avx512 () * 1<<7;
|
||||||
|
// score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
|
||||||
|
score += ggml_cpu_has_avx512_bf16() * 1<<9;
|
||||||
|
score += ggml_cpu_has_avx512_vnni() * 1<<10;
|
||||||
|
score += ggml_cpu_has_amx_int8 () * 1<<11;
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
|
||||||
|
|
||||||
|
#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
|
static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
|
||||||
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
#if defined(__AVX512VNNI__)
|
||||||
const __m512i zero = _mm512_setzero_si512();
|
const __m512i zero = _mm512_setzero_si512();
|
||||||
return _mm512_dpbusd_epi32(zero, ax, sy);
|
return _mm512_dpbusd_epi32(zero, ax, sy);
|
||||||
#else
|
#else
|
||||||
|
12
scripts/build-cpu.sh
Executable file
12
scripts/build-cpu.sh
Executable file
@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
name="$1"
|
||||||
|
args="${@:2}"
|
||||||
|
|
||||||
|
echo "Building $name with args: $args"
|
||||||
|
|
||||||
|
rm -fr build-cpu-$1
|
||||||
|
cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args
|
||||||
|
cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc)
|
||||||
|
cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so
|
||||||
|
rm -fr build-cpu-$1
|
Loading…
x
Reference in New Issue
Block a user