ggml : move AMX to the CPU backend (#10570)

* ggml : move AMX to the CPU backend

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Diego Devesa 2024-11-29 21:54:58 +01:00 committed by GitHub
parent b782e5c7d4
commit 7cc2d2c889
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
64 changed files with 514 additions and 801 deletions

View File

@ -17,8 +17,10 @@ Checks: >
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*, performance-*,
portability-*, portability-*,
-portability-simd-intrinsics,
misc-*, misc-*,
-misc-const-correctness, -misc-const-correctness,
-misc-non-private-member-variables-in-classes, -misc-non-private-member-variables-in-classes,
-misc-no-recursion, -misc-no-recursion,
-misc-use-anonymous-namespace,
FormatStyle: none FormatStyle: none

View File

@ -1121,6 +1121,11 @@ jobs:
run: | run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |

View File

@ -254,8 +254,8 @@ endif
# keep standard at C11 and C++11 # keep standard at C11 and C++11
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
MK_CFLAGS = -std=c11 -fPIC MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC MK_CXXFLAGS = -std=c++17 -fPIC
MK_NVCCFLAGS = -std=c++11 MK_NVCCFLAGS = -std=c++17
ifdef LLAMA_NO_CCACHE ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1 GGML_NO_CCACHE := 1
@ -575,9 +575,12 @@ endif
ifndef GGML_NO_AMX ifndef GGML_NO_AMX
MK_CPPFLAGS += -DGGML_USE_AMX MK_CPPFLAGS += -DGGML_USE_AMX
OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
endif endif
# only necessary for the CPU backend files
MK_CPPFLAGS += -Iggml/src/ggml-cpu
ifdef GGML_RPC ifdef GGML_RPC
MK_CPPFLAGS += -DGGML_USE_RPC MK_CPPFLAGS += -DGGML_USE_RPC
OBJ_GGML_EXT += ggml/src/ggml-rpc.o OBJ_GGML_EXT += ggml/src/ggml-rpc.o

View File

@ -28,13 +28,16 @@ var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]), .unsafeFlags(["-fno-objc-arc"]),
.headerSearchPath("ggml/src"), .headerSearchPath("ggml/src"),
.headerSearchPath("ggml/src/ggml-cpu"),
// NOTE: NEW_LAPACK will required iOS version 16.4+ // NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14 // We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"), // .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64") // .define("ACCELERATE_LAPACK_ILP64")
.define("GGML_USE_CPU"),
] ]
#if canImport(Darwin) #if canImport(Darwin)
sources.append("ggml/src/ggml-common.h") sources.append("ggml/src/ggml-common.h")
sources.append("ggml/src/ggml-metal/ggml-metal.m") sources.append("ggml/src/ggml-metal/ggml-metal.m")
@ -44,7 +47,6 @@ cSettings.append(
contentsOf: [ contentsOf: [
.define("GGML_USE_ACCELERATE"), .define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL"), .define("GGML_USE_METAL"),
.define("GGML_USE_CPU")
] ]
) )
#endif #endif

View File

@ -88,5 +88,5 @@ if (LLAMA_CURL)
endif () endif ()
target_include_directories(${TARGET} PUBLIC .) target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11) target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

View File

@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {
std::u32string filename_utf32; std::u32string filename_utf32;
try { try {
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
filename_utf32 = converter.from_bytes(filename); filename_utf32 = converter.from_bytes(filename);
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used, // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,

View File

@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp) add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp) add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp) add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp) add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp) add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,7 +2,7 @@ set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp) add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TEST_TARGET test-eval-callback) set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} add_test(NAME ${TEST_TARGET}

View File

@ -2,4 +2,4 @@ set(TARGET llama-export-lora)
add_executable(${TARGET} export-lora.cpp) add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp) add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp) add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -19,4 +19,4 @@ add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256) target_link_libraries(${TARGET} PRIVATE sha256)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp) add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gguf)
add_executable(${TARGET} gguf.cpp) add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp) add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp) add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-infill)
add_executable(${TARGET} infill.cpp) add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp) add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
target_include_directories(llava PUBLIC ../..) target_include_directories(llava PUBLIC ../..)
target_include_directories(llava PUBLIC ../../common) target_include_directories(llava PUBLIC ../../common)
target_compile_features(llava PRIVATE cxx_std_11) target_compile_features(llava PRIVATE cxx_std_17)
add_library(llava_static STATIC $<TARGET_OBJECTS:llava>) add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)
@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-minicpmv-cli) set(TARGET llama-minicpmv-cli)
add_executable(${TARGET} minicpmv-cli.cpp) add_executable(${TARGET} minicpmv-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-lookahead)
add_executable(${TARGET} lookahead.cpp) add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,22 +2,22 @@ set(TARGET llama-lookup)
add_executable(${TARGET} lookup.cpp) add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-create) set(TARGET llama-lookup-create)
add_executable(${TARGET} lookup-create.cpp) add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-merge) set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp) add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-lookup-stats) set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp) add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path}) target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-cli)
add_executable(${TARGET} main.cpp) add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-parallel)
add_executable(${TARGET} parallel.cpp) add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-passkey)
add_executable(${TARGET} passkey.cpp) add_executable(${TARGET} passkey.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-perplexity)
add_executable(${TARGET} perplexity.cpp) add_executable(${TARGET} perplexity.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-retrieval)
add_executable(${TARGET} retrieval.cpp) add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-run)
add_executable(${TARGET} run.cpp) add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-save-load-state)
add_executable(${TARGET} save-load-state.cpp) add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -50,4 +50,4 @@ if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif() endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-simple-chat)
add_executable(${TARGET} simple-chat.cpp) add_executable(${TARGET} simple-chat.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp) add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-speculative-simple)
add_executable(${TARGET} speculative-simple.cpp) add_executable(${TARGET} speculative-simple.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-speculative)
add_executable(${TARGET} speculative.cpp) add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -2,4 +2,4 @@ set(TARGET llama-tokenize)
add_executable(${TARGET} tokenize.cpp) add_executable(${TARGET} tokenize.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -161,7 +161,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON) option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF) option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING set (GGML_SYCL_TARGET "INTEL" CACHE STRING

View File

@ -1,25 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// buffer_type API
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
#ifdef __cplusplus
}
#endif

View File

@ -261,21 +261,15 @@ function(ggml_add_backend backend)
if (${backend_id}) if (${backend_id})
string(TOLOWER "ggml-${backend}" backend_target) string(TOLOWER "ggml-${backend}" backend_target)
add_subdirectory(${backend_target}) add_subdirectory(${backend_target})
# check again in case the backend disabled itself
# note that this should NOT be the normal behavior, in case of errors the backend should fail the build
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
if (${backend_id})
message(STATUS "Including ${backend} backend") message(STATUS "Including ${backend} backend")
if (NOT GGML_BACKEND_DL) if (NOT GGML_BACKEND_DL)
string(TOUPPER "GGML_USE_${backend}" backend_use) string(TOUPPER "GGML_USE_${backend}" backend_use)
target_compile_definitions(ggml PUBLIC ${backend_use}) target_compile_definitions(ggml PUBLIC ${backend_use})
endif() endif()
endif() endif()
endif()
endfunction() endfunction()
ggml_add_backend(CPU) ggml_add_backend(CPU)
ggml_add_backend(AMX)
ggml_add_backend(BLAS) ggml_add_backend(BLAS)
ggml_add_backend(CANN) ggml_add_backend(CANN)
ggml_add_backend(CUDA) ggml_add_backend(CUDA)
@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)
foreach (target ggml-base ggml) foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
target_compile_features (${target} PRIVATE c_std_11) # don't bump target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
endforeach() endforeach()
target_link_libraries(ggml-base PRIVATE Threads::Threads) target_link_libraries(ggml-base PRIVATE Threads::Threads)

View File

@ -1,105 +0,0 @@
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
message(STATUS "Using AMX")
file(GLOB GGML_HEADERS_AMX "*.h")
list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
file(GLOB GGML_SOURCES_AMX "*.cpp")
ggml_add_backend_library(ggml-amx
${GGML_HEADERS_AMX}
${GGML_SOURCES_AMX}
)
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
# TODO: integrate AMX backend into the CPU backend
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(../ggml-cpu/cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
set(GGML_AMX OFF PARENT_SCOPE)
message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()

View File

@ -1,449 +0,0 @@
#include "ggml-amx.h"
#include "ggml-amx/common.h"
#include "ggml-amx/mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__)
// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context);
}
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) {
ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else {
memcpy((char *)tensor->data + offset, data, size);
}
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
if (ggml_backend_buffer_is_host(src->buffer)) {
if (qtype_has_amx_kernels(src->type)) {
ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
} else {
memcpy(dst->data, src->data, ggml_nbytes(src));
}
return true;
}
return false;
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
}
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "AMX";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft);
}
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_amx;
}
// backend interface
static const char * ggml_backend_amx_name(ggml_backend_t backend) {
return "AMX";
GGML_UNUSED(backend);
}
static void ggml_backend_amx_free(ggml_backend_t backend) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
delete ctx;
delete backend;
}
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_backend_amx_mul_mat(ctx, node);
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
break;
default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
}
}
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
}
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_amx_guid() {
static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
return &guid;
}
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n");
return false;
}
return true;
#elif defined(_WIN32)
return true;
#endif
}
ggml_backend_t ggml_backend_amx_init() {
// invoke a Linux system call to request access to AMX features
ggml_amx_init();
// backend context
ggml_backend_amx_context * ctx = new ggml_backend_amx_context;
// ggml amx backend
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_amx_guid(),
/* .interface = */ ggml_backend_amx_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ ctx,
};
return backend;
}
bool ggml_backend_is_amx(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
}
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
GGML_ASSERT(ggml_backend_is_amx(backend_amx));
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
ctx->n_threads = n_threads;
}
// device interface
static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
return "AMX";
GGML_UNUSED(dev);
}
static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
return "Intel Advanced Matrix Extensions";
GGML_UNUSED(dev);
}
static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev);
}
static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_amx_device_get_name(dev);
props->description = ggml_backend_amx_device_get_description(dev);
props->type = ggml_backend_amx_device_get_type(dev);
ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);
// `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_amx_init();
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_amx_buffer_type();
GGML_UNUSED(dev);
}
static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x
return can_use_amx;
}
default:
return false;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
GGML_UNUSED(dev);
}
static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
/* .get_name = */ ggml_backend_amx_device_get_name,
/* .get_description = */ ggml_backend_amx_device_get_description,
/* .get_memory = */ ggml_backend_amx_device_get_memory,
/* .get_type = */ ggml_backend_amx_device_get_type,
/* .get_props = */ ggml_backend_amx_device_get_props,
/* .init_backend = */ ggml_backend_amx_device_init,
/* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_amx_device_supports_op,
/* .supports_buft = */ ggml_backend_amx_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
return "AMX";
GGML_UNUSED(reg);
}
static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
static ggml_backend_device ggml_backend_amx_device = {
/* .iface = */ ggml_backend_amx_device_i,
/* .reg = */ reg,
/* .context = */ nullptr,
};
return &ggml_backend_amx_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
}
static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_amx_set_n_threads;
}
return NULL;
GGML_UNUSED(reg);
GGML_UNUSED(name);
}
static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
/* .get_name = */ ggml_backend_amx_reg_get_name,
/* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
/* .get_device = */ ggml_backend_amx_reg_get_device,
/* .get_proc_address = */ ggml_backend_amx_get_proc_address,
};
ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_amx_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_amx_reg;
}
#else // if defined(__AMX_INT8__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
return nullptr;
}
bool ggml_backend_is_amx(ggml_backend_t backend) {
GGML_UNUSED(backend);
return false;
}
ggml_backend_t ggml_backend_amx_init(void) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
return nullptr;
}
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
GGML_UNUSED(backend_amx);
GGML_UNUSED(n_threads);
}
ggml_backend_reg_t ggml_backend_amx_reg(void) {
return nullptr;
}
#endif
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)

View File

@ -49,10 +49,6 @@
#include "ggml-rpc.h" #include "ggml-rpc.h"
#endif #endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
#include "ggml-cann.h" #include "ggml-cann.h"
#endif #endif
@ -92,9 +88,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC #ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
#ifdef GGML_USE_KOMPUTE #ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg()); register_backend(ggml_backend_kompute_reg());
#endif #endif

View File

@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
// since the tensor is pre-allocated, it cannot be moved to another backend // since the tensor is pre-allocated, it cannot be moved to another backend
GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name); ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
} }
// graph input // graph input

View File

@ -1,12 +1,20 @@
ggml_add_backend_library(ggml-cpu ggml_add_backend_library(ggml-cpu)
list (APPEND GGML_CPU_SOURCES
ggml-cpu.c ggml-cpu.c
ggml-cpu.cpp ggml-cpu.cpp
ggml-cpu-aarch64.c ggml-cpu-aarch64.c
ggml-cpu-aarch64.h ggml-cpu-aarch64.h
ggml-cpu-quants.c ggml-cpu-quants.c
ggml-cpu-quants.h ggml-cpu-quants.h
amx/amx.cpp
amx/amx.h
amx/mmq.cpp
amx/mmq.h
ggml-cpu-impl.h
) )
target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
target_include_directories(ggml-cpu PRIVATE .) target_include_directories(ggml-cpu PRIVATE .)
if (APPLE AND GGML_ACCELERATE) if (APPLE AND GGML_ACCELERATE)
@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
if (ACCELERATE_FRAMEWORK) if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found") message(STATUS "Accelerate framework found")
add_compile_definitions(GGML_USE_ACCELERATE) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK) target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64) target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
else() else()
@ -29,15 +37,9 @@ if (GGML_OPENMP)
if (OpenMP_FOUND) if (OpenMP_FOUND)
message(STATUS "OpenMP found") message(STATUS "OpenMP found")
add_compile_definitions(GGML_USE_OPENMP) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
# FIXME: should be replaced with a compiler id check
#if (GGML_MUSA)
# list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
# list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
#endif()
else() else()
message(WARNING "OpenMP not found") message(WARNING "OpenMP not found")
endif() endif()
@ -46,9 +48,9 @@ endif()
if (GGML_LLAMAFILE) if (GGML_LLAMAFILE)
message(STATUS "Using llamafile") message(STATUS "Using llamafile")
add_compile_definitions(GGML_USE_LLAMAFILE) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
target_sources(ggml-cpu PRIVATE list(APPEND GGML_CPU_SOURCES
llamafile/sgemm.cpp llamafile/sgemm.cpp
llamafile/sgemm.h) llamafile/sgemm.h)
endif() endif()
@ -58,7 +60,7 @@ if (GGML_CPU_HBM)
message(STATUS "Using memkind for CPU HBM") message(STATUS "Using memkind for CPU HBM")
add_compile_definitions(GGML_USE_CPU_HBM) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
target_link_libraries(ggml-cpu PUBLIC memkind) target_link_libraries(ggml-cpu PUBLIC memkind)
endif() endif()
@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
message(STATUS "ARM detected") message(STATUS "ARM detected")
if (MSVC) if (MSVC)
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
add_compile_definitions(__ARM_NEON) list(APPEND ARCH_DEFINITIONS __ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled") message(STATUS "ARM feature DOTPROD enabled")
endif () endif ()
@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled") message(STATUS "ARM feature MATMUL_INT8 enabled")
endif () endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
endif () endif ()
@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
add_compile_definitions(__ARM_FEATURE_DOTPROD) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled") message(STATUS "ARM feature DOTPROD enabled")
endif () endif ()
@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled") message(STATUS "ARM feature MATMUL_INT8 enabled")
endif () endif ()
@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (MSVC) if (MSVC)
# instruction set detection for MSVC only # instruction set detection for MSVC only
if (GGML_NATIVE) if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(cmake/FindSIMD.cmake) include(cmake/FindSIMD.cmake)
endif () endif ()
if (GGML_AVX512) if (GGML_AVX512)
@ -185,37 +186,31 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
# macros corresponding to the extensions. # macros corresponding to the extensions.
# Do it manually. # Do it manually.
if (GGML_AVX512_VBMI) if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>) list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vbmi) list(APPEND ARCH_FLAGS -mavx512vbmi)
endif() endif()
endif() endif()
if (GGML_AVX512_VNNI) if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
endif() endif()
endif() endif()
if (GGML_AVX512_BF16) if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>) list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512bf16) list(APPEND ARCH_FLAGS -mavx512bf16)
endif() endif()
endif() endif()
if (GGML_AMX_TILE) if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>) list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif() endif()
if (GGML_AMX_INT8) if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>) list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif() endif()
if (GGML_AMX_BF16) if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>) list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif() endif()
elseif (GGML_AVX2) elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2) list(APPEND ARCH_FLAGS /arch:AVX2)
@ -276,7 +271,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le) list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else() else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected") message(STATUS "loongarch64 detected")
@ -299,11 +294,12 @@ endif()
if (GGML_CPU_AARCH64) if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
add_compile_definitions(GGML_USE_CPU_AARCH64) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
endif() endif()
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>") target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>") set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
if (EMSCRIPTEN) if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")

View File

@ -0,0 +1,196 @@
#include "amx.h"
#include "common.h"
#include "mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context);
}
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) {
ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else {
memcpy((char *)tensor->data + offset, data, size);
}
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
if (ggml_backend_buffer_is_host(src->buffer)) {
if (qtype_has_amx_kernels(src->type)) {
ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
} else {
memcpy(dst->data, src->data, ggml_nbytes(src));
}
return true;
}
return false;
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
}
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "AMX";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft);
}
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n");
return false;
}
return true;
#elif defined(_WIN32)
return true;
#endif
}
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
};
if (!ggml_amx_init()) {
return NULL;
}
return &ggml_backend_buffer_type_amx;
}
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x
return can_use_amx;
}
default:
return false;
}
}
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

View File

@ -0,0 +1,20 @@
#include "ggml-backend.h"
#include "ggml-cpu-impl.h"
#ifdef __cplusplus
extern "C" {
#endif
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
#endif
#ifdef __cplusplus
}
#endif

View File

@ -1,8 +1,7 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml.h"
// hack until AMX is moved into the CPU backend #include "ggml-cpu-impl.h"
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
@ -74,16 +73,24 @@ inline void parallel_for(int nth, int n, const func_t& f) {
#endif #endif
} }
template <typename func_t>
inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
int tbegin, tend;
balance211(n, params->nth, params->ith, tbegin, tend);
f(tbegin, tend);
ggml_barrier(params->threadpool); // TODO: might not always be needed
}
// quantized types that have AMX support // quantized types that have AMX support
inline bool qtype_has_amx_kernels(const enum ggml_type type) { inline bool qtype_has_amx_kernels(const enum ggml_type type) {
// TODO: fix padding for vnni format // TODO: fix padding for vnni format
return (type == GGML_TYPE_Q4_0) || return (type == GGML_TYPE_Q4_0) ||
(type == GGML_TYPE_Q4_1); (type == GGML_TYPE_Q4_1) ||
//(type == GGML_TYPE_Q8_0) || (type == GGML_TYPE_Q8_0) ||
//(type == GGML_TYPE_Q4_K) || (type == GGML_TYPE_Q4_K) ||
//(type == GGML_TYPE_Q5_K) || (type == GGML_TYPE_Q5_K) ||
//(type == GGML_TYPE_Q6_K) || (type == GGML_TYPE_Q6_K) ||
//(type == GGML_TYPE_IQ4_XS); (type == GGML_TYPE_IQ4_XS);
} }
// ggml backend context // ggml backend context

View File

@ -4,8 +4,11 @@
#pragma GCC diagnostic ignored "-Wunused-local-typedefs" #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif #endif
#include "amx.h"
#include "mmq.h" #include "mmq.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu-quants.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include <algorithm> #include <algorithm>
#include <type_traits> #include <type_traits>
@ -33,7 +36,7 @@
#define ALWAYS_INLINE inline #define ALWAYS_INLINE inline
#endif #endif
#if defined(__AMX_INT8__) #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
namespace { namespace {
@ -496,13 +499,12 @@ inline void from_float(const float * x, char * vy, int64_t k);
template <> template <>
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) { inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
// FIXME: using unoptimized reference impl until moved to CPU backend quantize_row_q8_0(x, (block_q8_0 *)vy, k);
quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
} }
template <> template <>
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) { inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k); quantize_row_q8_1(x, (block_q8_1 *)vy, k);
} }
template <> template <>
@ -950,7 +952,7 @@ template<typename TB, typename packed_B_t = packed_B_type<TB>>
void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) { void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) {
GGML_UNUSED(tile); GGML_UNUSED(tile);
GGML_UNUSED(packed_B); GGML_UNUSED(packed_B);
}; }
template <> template <>
void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) { void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) {
@ -2327,9 +2329,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) {
// pack weight to vnni format // pack weight to vnni format
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now
size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor);
GGML_ASSERT(alloc_size == size);
const enum ggml_type TYPE = tensor->type; const enum ggml_type TYPE = tensor->type;
@ -2348,6 +2348,29 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
}); });
} }
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
const enum ggml_type TYPE = src0->type;
const bool is_floating_type = TYPE == GGML_TYPE_F16;
if (is_floating_type) {
return 0;
}
const int M = dst->ne[1];
const int K = src0->ne[0];
size_t desired_wsize = 0;
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
desired_wsize = M * row_size_A;
});
return desired_wsize;
}
// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX) // NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
// //
// src0: weight in shape of {N, K}, quantized // src0: weight in shape of {N, K}, quantized
@ -2356,14 +2379,12 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
// //
// the function performs: dst = src1 @ src0.T // the function performs: dst = src1 @ src0.T
// //
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0]; struct ggml_tensor * src0 = dst->src[0];
struct ggml_tensor * src1 = dst->src[1]; struct ggml_tensor * src1 = dst->src[1];
const enum ggml_type TYPE = src0->type; const enum ggml_type TYPE = src0->type;
const int n_threads = ctx->n_threads;
// f16 only has avx512 kernels for now, // f16 only has avx512 kernels for now,
// amx kernels will be added once 6th gen xeon is released. // amx kernels will be added once 6th gen xeon is released.
const bool is_floating_type = TYPE == GGML_TYPE_F16; const bool is_floating_type = TYPE == GGML_TYPE_F16;
@ -2379,7 +2400,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
const int MB = div_up(M, BLOCK_M); const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N); const int NB = div_up(N, BLOCK_N);
parallel_for(n_threads, MB * NB, [&](int begin, int end) { parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
for (int i = begin; i < end; ++i) { for (int i = begin; i < end; ++i) {
int mb = i / NB; int mb = i / NB;
@ -2412,17 +2433,16 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
} }
// pointer to work space, used convert A from float to quantized type // pointer to work space, used convert A from float to quantized type
void * wdata = nullptr; void * wdata = params->wdata;
//TODO: performance improvement: merge quant A //TODO: performance improvement: merge quant A
if (params->ith == 0) {
GGML_DISPATCH_QTYPES(TYPE, [&] { GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
const size_t desired_wsize = M * row_size_A; const size_t desired_wsize = M * row_size_A;
if (ctx->work_size < desired_wsize) { if (params->wsize < desired_wsize) {
ctx->work_data.reset(new char[desired_wsize]); GGML_ABORT("insufficient work space size");
ctx->work_size = desired_wsize;
} }
wdata = ctx->work_data.get();
// Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
@ -2433,6 +2453,9 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K); from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
} }
}); });
}
ggml_barrier(params->threadpool);
if (M == 1) { if (M == 1) {
// MB = 1 and handle 8 tiles in each block // MB = 1 and handle 8 tiles in each block
@ -2440,7 +2463,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
constexpr int BLOCK_N = TILE_N * kTilesN; constexpr int BLOCK_N = TILE_N * kTilesN;
const int NB = div_up(N, BLOCK_N); const int NB = div_up(N, BLOCK_N);
parallel_for(n_threads, NB, [&](int begin, int end) { parallel_for_ggml(params, NB, [&](int begin, int end) {
GGML_DISPATCH_QTYPES(TYPE, [&] { GGML_DISPATCH_QTYPES(TYPE, [&] {
const int KB = K / blck_size; const int KB = K / blck_size;
const int TILE_SIZE = get_tile_size<type>(); const int TILE_SIZE = get_tile_size<type>();
@ -2470,7 +2493,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
const int MB = div_up(M, BLOCK_M); const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N); const int NB = div_up(N, BLOCK_N);
parallel_for(n_threads, MB * NB, [&](int begin, int end) { parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
// init tile config for each thread // init tile config for each thread
ggml_tile_config_init(); ggml_tile_config_init();
@ -2498,13 +2521,4 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
}); });
} }
#else // if defined(__AMX_INT8__) #endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
GGML_UNUSED(ctx);
GGML_UNUSED(dst);
}
#endif // if defined(__AMX_INT8__)

View File

@ -1,6 +1,5 @@
#pragma once #pragma once
#include "common.h" #include "common.h"
#include <stdint.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -10,7 +9,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst); void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -15,6 +15,18 @@
extern "C" { extern "C" {
#endif #endif
struct ggml_compute_params {
// ith = thread index, nth = number of threads
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
struct ggml_threadpool * threadpool;
};
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define m512bh(p) p #define m512bh(p) p
@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
} }
#endif #endif
// TODO: move to ggml-threading
void ggml_barrier(struct ggml_threadpool * tp);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -10,6 +10,7 @@
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml-cpu-quants.h" #include "ggml-cpu-quants.h"
#include "ggml-threading.h" #include "ggml-threading.h"
#include "amx/amx.h"
#include "ggml.h" #include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
@ -624,7 +625,7 @@ do { \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \ } \
res = _mm512_reduce_add_ps(x[0]); \ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0) } while (0)
// TODO: is this optimal ? // TODO: is this optimal ?
@ -674,7 +675,7 @@ do { \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \ } \
res = _mm512_reduce_add_ps(x[0]); \ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0) } while (0)
#define GGML_F16_VEC GGML_F32Cx16 #define GGML_F16_VEC GGML_F32Cx16
@ -685,8 +686,8 @@ do { \
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__) #elif defined(__AVX__)
#define GGML_SIMD #define GGML_SIMD
@ -1182,22 +1183,22 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
{ \ { \
int offset = GGML_F32_ARR >> 1; \ int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \ } \
__m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \ __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
tmp = __lsx_vsrli_d((__m128i)t0, 32); \ tmp = __lsx_vsrli_d((__m128i) t0, 32); \
tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
} }
@ -1367,31 +1368,15 @@ struct ggml_compute_state {
int ith; int ith;
}; };
struct ggml_compute_params {
// ith = thread index, nth = number of threads
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
struct ggml_threadpool * threadpool;
};
// //
// fundamental operations // fundamental operations
// //
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
@ -2286,7 +2271,7 @@ struct ggml_state {
static struct ggml_state g_state = {0}; static struct ggml_state g_state = {0};
static void ggml_barrier(struct ggml_threadpool * tp) { void ggml_barrier(struct ggml_threadpool * tp) {
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
if (n_threads == 1) { if (n_threads == 1) {
return; return;
@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
type = (enum ggml_type)(intptr_t)src0->extra; type = (enum ggml_type)(intptr_t)src0->extra;
} }
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
ggml_backend_amx_mul_mat(params, dst);
return;
}
#endif
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
} break; } break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
{ {
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
cur = ggml_backend_amx_desired_wsize(node);
}
#endif
const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
if (node->src[1]->type != vec_dot_type) { if (node->src[1]->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
cur = MAX(cur, cur2);
} }
} break; } break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:

View File

@ -3,6 +3,7 @@
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h" #include "ggml-cpu-aarch64.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "amx/amx.h"
#include <cctype> #include <cctype>
#include <string> #include <string>
#include <vector> #include <vector>
@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
static std::vector<ggml_backend_buffer_type_t> bufts = []() { static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t> bufts; std::vector<ggml_backend_buffer_type_t> bufts;
#ifdef GGML_USE_CPU_HBM #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
bufts.push_back(ggml_backend_cpu_hbm_buffer_type()); if (ggml_backend_amx_buffer_type()) {
bufts.push_back(ggml_backend_amx_buffer_type());
}
#endif #endif
#ifdef GGML_USE_CPU_AARCH64 #ifdef GGML_USE_CPU_AARCH64
if (ggml_backend_cpu_aarch64_buffer_type()) {
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
}
#endif #endif
bufts.push_back(NULL); bufts.push_back(NULL);
@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * src1 = op->src[1];
if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
return true;
}
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
return false; return false;
} }
} }
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
return ggml_backend_amx_device_supports_op(op);
}
for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
return false;
}
}
#endif
for (int i = 1; i < GGML_MAX_SRC; i++) { for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
return false; return false;
@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
} }
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
supported = supported || ggml_backend_amx_buft_is_amx(buft);
#endif
return supported;
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }

View File

@ -50,8 +50,7 @@
#include "sgemm.h" #include "sgemm.h"
#include "ggml-impl.h" #include "ggml-impl.h"
// hack until moved into the CPU backend #include "ggml-cpu-impl.h"
#include "../ggml-cpu-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@ -30,11 +30,13 @@
extern "C" { extern "C" {
#endif #endif
#undef MIN #ifndef MIN
#undef MAX # define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b)) # define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
// required for mmap as gguf only guarantees 32-byte alignment // required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32 #define TENSOR_ALIGNMENT 32

View File

@ -3,5 +3,5 @@ find_package (Threads REQUIRED)
set(TARGET vulkan-shaders-gen) set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp) add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)

View File

@ -1,9 +1,9 @@
set(TARGET llama-vdot) set(TARGET llama-vdot)
add_executable(${TARGET} vdot.cpp) add_executable(${TARGET} vdot.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-q8dot) set(TARGET llama-q8dot)
add_executable(${TARGET} q8dot.cpp) add_executable(${TARGET} q8dot.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -25,7 +25,7 @@ add_library(llama
) )
target_include_directories(llama PUBLIC . ../include) target_include_directories(llama PUBLIC . ../include)
target_compile_features (llama PUBLIC cxx_std_11) # don't bump target_compile_features (llama PUBLIC cxx_std_17) # don't bump
target_link_libraries(llama PUBLIC ggml) target_link_libraries(llama PUBLIC ggml)

View File

@ -201,7 +201,18 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
} }
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv; std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
return conv.from_bytes(s); return conv.from_bytes(s);
} }

View File

@ -284,7 +284,7 @@ static void test_perf() {
data.reserve(n_vocab); data.reserve(n_vocab);
for (int i = 0; i < n_vocab; i++) { for (int i = 0; i < n_vocab; i++) {
const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f); const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5);
data.emplace_back(llama_token_data{i, logit, 0.0f}); data.emplace_back(llama_token_data{i, logit, 0.0f});
} }