From 3b8f1ec4b18770531d0b1d792f3edf08254e4f0c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 17 Apr 2024 23:58:26 +0300
Subject: [PATCH] llamafile : tmp disable + build sgemm.o when needed (#6716)

* build : sgemm.o only when needed

ggml-ci

* llamafile : tmp disable due to MoE bug

ggml-ci
---
 CMakeLists.txt | 38 ++++++++++++++++++++++++++------------
 Makefile       | 28 +++++++++++++++-------------
 2 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cc0df3fb..f134a153b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,18 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
 
+# TODO: fix this for Android CI
+#       https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
+#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
+#    set(LLAMA_LLAMAFILE_DEFAULT OFF)
+#else()
+#    set(LLAMA_LLAMAFILE_DEFAULT ON)
+#endif()
+
+# TODO: temporary disable until MoE is fixed
+#       https://github.com/ggerganov/llama.cpp/pull/6716
+set(LLAMA_LLAMAFILE_DEFAULT OFF)
+
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
 option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -88,7 +100,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
-option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ON)
+option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -372,6 +384,9 @@ endif()
 
 if (LLAMA_LLAMAFILE)
     add_compile_definitions(GGML_USE_LLAMAFILE)
+
+    set(GGML_HEADERS_LLAMAFILE sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
 endif()
 
 if (LLAMA_QKK_64)
@@ -1157,17 +1172,16 @@ add_library(ggml OBJECT
             ggml-backend.h
             ggml-quants.c
             ggml-quants.h
-            sgemm.cpp
-            sgemm.h
-            ${GGML_SOURCES_CUDA}    ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
diff --git a/Makefile b/Makefile
index 9a711743b..760015f29 100644
--- a/Makefile
+++ b/Makefile
@@ -219,13 +219,6 @@ ifdef LLAMA_DISABLE_LOGS
 	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS
 
-# disable ggml.c's use of sgemm.cpp
-ifdef LLAMA_NO_LLAMAFILE
-	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE=0
-else
-	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE=1
-endif
-
 # warnings
 WARN_FLAGS = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
@@ -391,6 +384,15 @@ ifdef LLAMA_OPENBLAS
 	MK_LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
+# TODO: temporary disable until MoE is fixed
+#       https://github.com/ggerganov/llama.cpp/pull/6716
+LLAMA_NO_LLAMAFILE := 1
+
+ifndef LLAMA_NO_LLAMAFILE
+	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+	OBJS        += sgemm.o
+endif
+
 ifdef LLAMA_BLIS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -487,11 +489,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
-
 endif # LLAMA_CUDA
 
 ifdef LLAMA_CLBLAST
-
 	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
 	MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
 	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -610,6 +610,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
+ifndef LLAMA_NO_LLAMAFILE
+sgemm.o: sgemm.cpp sgemm.h ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 GF_CC := $(CC)
 include scripts/get-flags.mk
 
@@ -683,16 +688,13 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-sgemm.o: sgemm.cpp sgemm.h ggml.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o sgemm.o
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
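Usage note: this patch only flips the defaults; the llamafile SGEMM code path itself stays in the tree. For the CMake build, the LLAMA_LLAMAFILE option introduced above can still be turned on explicitly. A minimal sketch, assuming an out-of-tree build directory named build (the directory name is illustrative and not part of the patch):

    # configure with the llamafile SGEMM path explicitly re-enabled,
    # overriding the temporary OFF default set by this patch
    cmake -B build -DLLAMA_LLAMAFILE=ON
    # then compile as usual
    cmake --build build

The Makefile build, by contrast, hard-codes LLAMA_NO_LLAMAFILE := 1 before the ifndef check, so leaving the variable unset in the environment is not enough to re-enable sgemm.o there.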