2023-11-13 13:16:23 +01:00
# include "ggml-backend-impl.h"
2023-10-08 19:19:14 +02:00
# include "ggml-alloc.h"
2023-11-13 13:16:23 +01:00
# include "ggml-impl.h"
2023-10-08 19:19:14 +02:00
# include <assert.h>
2023-11-13 13:16:23 +01:00
# include <limits.h>
2023-10-08 19:19:14 +02:00
# include <stdarg.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# define MAX(a, b) ((a) > (b) ? (a) : (b))
2023-12-07 21:26:54 +01:00
// backend buffer type
2024-01-12 20:07:38 +01:00
// Returns the string produced by the buffer type's get_name callback.
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
    const char * name = buft->iface.get_name(buft);
    return name;
}
2024-01-16 12:16:33 +01:00
// Forwards the request for a `size`-byte buffer to the buffer type's
// alloc_buffer callback and returns whatever that callback returns.
GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t allocated = buft->iface.alloc_buffer(buft, size);
    return allocated;
}
// Returns the value reported by the buffer type's get_alignment callback.
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
    const size_t alignment = buft->iface.get_alignment(buft);
    return alignment;
}
ggml : add Vulkan backend (#2059)
* Vulkan loader code
* Fix matmul kernel, continue implementation
* Continue implementation
* Vulkan memory management
* Vulkan development
* Matmul call
* Add aligned malloc and free for VMA
* Continue implementation
* First matmul success
* GEMM Kernel optimization
* 1D Blocktiling
* 2D Blocktiling
* Write coalescing
* Continue vulkan implementation and optimization
* First FP16 attempt, disabled for now
* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel
* Enable device extensions properly, restore fp16 matmul op
* Fix mulmat_f16
* Output FP32 in fp16 matmul shader
* Fix f16_to_f32 kernel
* dequant_q4_0 kernel
* Add VMA library
* Avoid requesting dedicated memory, VMA can decide that by itself
* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly
* add cmake commands
* Add 2d write operation, profiling code
* Fix 2d write
* Fix queue selection for AMD RADV
* Fix trailing whitespace in vk_mem_alloc.h
* Add WIP warp tile mat mul shaders
* Disable glslc optimization
* Disable glslc optimization for CMake
* Optimize warptile matmul shader, replace blocktile with it
* Add split-k optimization for small matrix multiplication
Use semaphores for synchronization instead of fences or waitidle
Rework async write/read for synchronization
* Fix validation errors, improve compatibility with AMD GPUs
* Rework command buffer handling
* Variable matmul kernel using specialization constants
* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints
* Reuse semaphores
* Handle stage flags during command buffer submission properly
* Increase matmul test runs for consistent results
* Fix F32 matmul
* Add vectorized loading and zeropadding for matrix multiplication
* Use pinned memory for f16 preprocessing
* Don't force aligned matmul
* Don't free before queue done
* Replace VMA library with native Vulkan buffer management
* Basic offloading support with mul_f32 and dmmv for q4_0
* Run glslc commands in parallel
* Unroll loops in dmmv shader
* Reduce usage of waitIdle
* Reuse pinned allocation for f16 conversion
* Handle devices with only a single queue
* Fix trailing whitespace in CMakeLists.txt
* Allow parallel execution of kernels, parallelize third and fourth dimension calls
* Add fallback for devices only supporting one DescriptorSet per DescriptorPool
* Move to graph function similar to CUDA implementation
* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function
* Add F32 dmmv shaders
* Batch submissions
* Add .spv to gitignore
* Split off matrix vector multiplication for separate optimization
* Use single command buffer for matrix vector multiplication ops
* Reduce overhead of mul_f32 calls by using a single command buffer
* Add submission batching to mul_f32
* Fix tests
* Add missing barrier
* Add further missing barrier
* Add further ops
* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions
* Remove unnecessary cblas link
* Fix descriptor set pre-allocation assert
* Add runtime shader compilation, start transferring shaders to this approach
* Transfer remaining shaders to header and compile on runtime
* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16
* Add support for q4_1, q5_0, q5_1 and q8_0
* Remove unnecessary scalar layout extension
* Parse graph early to pre-record command buffers
* Add q6_k support
* Add multi-submit for command buffers
* Fix q6_k dequant shader for AMD
* Fix q6_k for GPUs without fp16 support
* Simplify q6_k fp16 fix
* Minor fixes
* Fix wg_denom of m-mulmat shaders
* Add Python-based Vulkan shader generator
* Replace shaderc dependency with precompiled shaders
Fix python script to generate shaders
* Clean up code
* Fix shader generator script Windows compatibility
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
* Close file before deletion
* Fix vulkan shader fp32 name
* Add q2_k and q3_k support
Add validation check to compare shader results to cpu results
* Add q4_k support
* Add q5_k support
* Bake SPIR-V bytecode into the library instead of loading shaders from file
* Switch to signal semaphores for flexibility
Prepare broadcasting support for mul mat
* Finish broadcasting mul mat support for GQA
* Clean up unused functions
Add repeat op
* Add further ops, not yet enabled. Improve semaphore code
* Reduce number of used semaphores by utilizing timelines more properly
* Remove queue information
* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations
* Add Vulkan to llama-bench
* Remove cblas dependency
* Fix matmul k-split bug
* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader
* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug
* Fix issues with float16 overflows in shaders
* Fix issues with older Vulkan headers on Ubuntu 22.04
* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers
* Implement further ops, rework op_f32 calls, fix bugs
* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code
* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders
* Merge upstream changes, fix conflicts, adapt soft_max op
* Fix Python and shader header format
* Free model gpu buffers on exit
* Use single queue per device to simplify code
* Add matmul shader support for running multiple calculations in parallel
* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible
* Fix missing event cast
* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity
* Fix warning about empty C function parameters
* Fix compiler warnings
* Properly implement Vulkan backend buffer handling
* Fix oversized host staging buffers
* Simplify barrier synchronization calls
* Fix gcc warnings
* Implement max_size for backend buffer types to limit the size of a single allocation
* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size
* refactor multi buf
* Disable unsupported ops to fix tests
* Check for maintenance4 support before using it
* Handle devices with only a single queue
* Fix single queue logic
* propagate buffer usage in multi buffers
* Implement rope_neox op
* Cleanup header and other files
* Simplify gpu_extras by removing events and putting staging memcpys into contexts
* Move queue into context
Add not-yet-enabled async backend ops
* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization
* Add get_max_size to SYCL backend.
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* llama : fix trailing whitespace
---------
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 18:03:59 +01:00
// Returns the buffer type's maximum size as reported by its get_max_size
// callback. The callback is optional: when it is not set, SIZE_MAX is returned.
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
    if (buft->iface.get_max_size == NULL) {
        return SIZE_MAX;
    }

    return buft->iface.get_max_size(buft);
}
2024-01-16 12:16:33 +01:00
// Returns the number of bytes to allocate for `tensor` in this buffer type.
// The get_alloc_size callback is optional: when it is not set, the result is
// ggml_nbytes(tensor).
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
    if (buft->iface.get_alloc_size == NULL) {
        return ggml_nbytes(tensor);
    }

    const size_t alloc_size = buft->iface.get_alloc_size(buft, tensor);
    // the callback must never report less than the tensor's own byte size
    assert(alloc_size >= ggml_nbytes(tensor));
    return alloc_size;
}
2023-12-21 21:07:46 +01:00
// Returns the result of the buffer type's is_host callback.
// The callback is optional: when it is not set, the answer is false.
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
    if (buft->iface.is_host == NULL) {
        return false;
    }

    return buft->iface.is_host(buft);
}
2023-10-08 19:19:14 +02:00
// backend buffer
2024-01-16 12:16:33 +01:00
// Allocates a ggml_backend_buffer on the heap and fills it in.
//
//   buft    - buffer type stored in the new buffer
//   iface   - buffer interface (callback table), copied by value
//   context - backend-specific context pointer stored in the new buffer
//   size    - size recorded for the buffer
//
// The usage field starts as GGML_BACKEND_BUFFER_USAGE_ANY.
// Aborts through GGML_ASSERT if the allocation fails, instead of writing
// through a NULL pointer. Release the result with ggml_backend_buffer_free().
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
        ggml_backend_buffer_type_t      buft,
        struct ggml_backend_buffer_i    iface,
        ggml_backend_buffer_context_t   context,
        size_t                          size) {
    ggml_backend_buffer_t buffer = malloc(sizeof(*buffer));
    GGML_ASSERT(buffer != NULL && "failed to allocate backend buffer");

    (*buffer) = (struct ggml_backend_buffer) {
        /* .iface   = */ iface,
        /* .buft    = */ buft,
        /* .context = */ context,
        /* .size    = */ size,
        /* .usage   = */ GGML_BACKEND_BUFFER_USAGE_ANY
    };

    return buffer;
}
2024-01-12 20:07:38 +01:00
// Returns the string produced by the buffer's get_name callback.
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
    const char * name = buffer->iface.get_name(buffer);
    return name;
}
2023-10-08 19:19:14 +02:00
// Releases a buffer: runs the optional free_buffer callback, then frees the
// buffer struct itself. Passing NULL is a no-op.
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (!buffer) {
        return;
    }

    // free_buffer is optional
    if (buffer->iface.free_buffer) {
        buffer->iface.free_buffer(buffer);
    }

    free(buffer);
}
// Returns the size stored in the buffer struct.
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    const size_t size = buffer->size;
    return size;
}
2023-11-13 13:16:23 +01:00
// Returns the base pointer reported by the buffer's get_base callback.
// Asserts (GGML_ASSERT) that the callback did not return NULL.
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * const base_ptr = buffer->iface.get_base(buffer);

    GGML_ASSERT(base_ptr != NULL && " backend buffer base cannot be NULL ");

    return base_ptr;
}
2024-01-16 12:16:33 +01:00
// Invokes the buffer's init_tensor callback for `tensor`.
// The callback is optional: when it is not set, nothing happens.
GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.init_tensor == NULL) {
        return;
    }

    buffer->iface.init_tensor(buffer, tensor);
}
2023-12-07 21:26:54 +01:00
// Returns the alignment of the buffer's type (see ggml_backend_buft_get_alignment).
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_type_t type = ggml_backend_buffer_get_type(buffer);
    return ggml_backend_buft_get_alignment(type);
}
ggml : add Vulkan backend (#2059)
* Vulkan loader code
* Fix matmul kernel, continue implementation
* Continue implementation
* Vulkan memory management
* Vulkan development
* Matmul call
* Add aligned malloc and free for VMA
* Continue implementation
* First matmul success
* GEMM Kernel optimization
* 1D Blocktiling
* 2D Blocktiling
* Write coalescing
* Continue vulkan implementation and optimization
* First FP16 attempt, disabled for now
* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel
* Enable device extensions properly, restore fp16 matmul op
* Fix mulmat_f16
* Output FP32 in fp16 matmul shader
* Fix f16_to_f32 kernel
* dequant_q4_0 kernel
* Add VMA library
* Avoid requesting dedicated memory, VMA can decide that by itself
* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly
* add cmake commands
* Add 2d write operation, profiling code
* Fix 2d write
* Fix queue selection for AMD RADV
* Fix trailing whitespace in vk_mem_alloc.h
* Add WIP warp tile mat mul shaders
* Disable glslc optimization
* Disable glslc optimization for CMake
* Optimize warptile matmul shader, replace blocktile with it
* Add split-k optimization for small matrix multiplication
Use semaphores for synchronization instead of fences or waitidle
Rework async write/read for synchronization
* Fix validation errors, improve compatibility with AMD GPUs
* Rework command buffer handling
* Variable matmul kernel using specialization constants
* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints
* Reuse semaphores
* Handle stage flags during command buffer submission properly
* Increase matmul test runs for consistent results
* Fix F32 matmul
* Add vectorized loading and zeropadding for matrix multiplication
* Use pinned memory for f16 preprocessing
* Don't force aligned matmul
* Don't free before queue done
* Replace VMA library with native Vulkan buffer management
* Basic offloading support with mul_f32 and dmmv for q4_0
* Run glslc commands in parallel
* Unroll loops in dmmv shader
* Reduce usage of waitIdle
* Reuse pinned allocation for f16 conversion
* Handle devices with only a single queue
* Fix trailing whitespace in CMakeLists.txt
* Allow parallel execution of kernels, parallelize third and fourth dimension calls
* Add fallback for devices only supporting one DescriptorSet per DescriptorPool
* Move to graph function similar to CUDA implementation
* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function
* Add F32 dmmv shaders
* Batch submissions
* Add .spv to gitignore
* Split off matrix vector multiplication for separate optimization
* Use single command buffer for matrix vector multiplication ops
* Reduce overhead of mul_f32 calls by using a single command buffer
* Add submission batching to mul_f32
* Fix tests
* Add missing barrier
* Add further missing barrier
* Add further ops
* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions
* Remove unnecessary cblas link
* Fix descriptor set pre-allocation assert
* Add runtime shader compilation, start transferring shaders to this approach
* Transfer remaining shaders to header and compile on runtime
* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16
* Add support for q4_1, q5_0, q5_1 and q8_0
* Remove unnecessary scalar layout extension
* Parse graph early to pre-record command buffers
* Add q6_k support
* Add multi-submit for command buffers
* Fix q6_k dequant shader for AMD
* Fix q6_k for GPUs without fp16 support
* Simplify q6_k fp16 fix
* Minor fixes
* Fix wg_denom of m-mulmat shaders
* Add Python-based Vulkan shader generator
* Replace shaderc dependency with precompiled shaders
Fix python script to generate shaders
* Clean up code
* Fix shader generator script Windows compatibility
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
* Close file before deletion
* Fix vulkan shader fp32 name
* Add q2_k and q3_k support
Add validation check to compare shader results to cpu results
* Add q4_k support
* Add q5_k support
* Bake SPIR-V bytecode into the library instead of loading shaders from file
* Switch to signal semaphores for flexibility
Prepare broadcasting support for mul mat
* Finish broadcasting mul mat support for GQA
* Clean up unused functions
Add repeat op
* Add further ops, not yet enabled. Improve semaphore code
* Reduce number of used semaphores by utilizing timelines more properly
* Remove queue information
* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations
* Add Vulkan to llama-bench
* Remove cblas dependency
* Fix matmul k-split bug
* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader
* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug
* Fix issues with float16 overflows in shaders
* Fix issues with older Vulkan headers on Ubuntu 22.04
* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers
* Implement further ops, rework op_f32 calls, fix bugs
* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code
* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders
* Merge upstream changes, fix conflicts, adapt soft_max op
* Fix Python and shader header format
* Free model gpu buffers on exit
* Use single queue per device to simplify code
* Add matmul shader support for running multiple calculations in parallel
* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible
* Fix missing event cast
* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity
* Fix warning about empty C function parameters
* Fix compiler warnings
* Properly implement Vulkan backend buffer handling
* Fix oversized host staging buffers
* Simplify barrier synchronization calls
* Fix gcc warnings
* Implement max_size for backend buffer types to limit the size of a single allocation
* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size
* refactor multi buf
* Disable unsupported ops to fix tests
* Check for maintenance4 support before using it
* Handle devices with only a single queue
* Fix single queue logic
* propagate buffer usage in multi buffers
* Implement rope_neox op
* Cleanup header and other files
* Simplify gpu_extras by removing events and putting staging memcpys into contexts
* Move queue into context
Add not-yet-enabled async backend ops
* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization
* Add get_max_size to SYCL backend.
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* llama : fix trailing whitespace
---------
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 18:03:59 +01:00
// Returns the maximum size of the buffer's type (see ggml_backend_buft_get_max_size).
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_type_t type = ggml_backend_buffer_get_type(buffer);
    return ggml_backend_buft_get_max_size(type);
}
2023-12-07 21:26:54 +01:00
// Returns the allocation size for `tensor` according to the buffer's type
// (see ggml_backend_buft_get_alloc_size).
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    ggml_backend_buffer_type_t type = ggml_backend_buffer_get_type(buffer);
    return ggml_backend_buft_get_alloc_size(type, tensor);
}
2023-10-08 19:19:14 +02:00
2023-12-21 21:07:46 +01:00
// Passes `value` to the buffer's clear callback.
// The callback is invoked unconditionally, so it must be set.
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    (buffer->iface.clear)(buffer, value);
}
// Returns whether the buffer's type reports itself as host (see ggml_backend_buft_is_host).
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_type_t type = ggml_backend_buffer_get_type(buffer);
    return ggml_backend_buft_is_host(type);
}
2024-01-12 20:07:38 +01:00
// Records `usage` on the buffer. When the buffer is a multi-buffer, the
// usage is also forwarded via ggml_backend_multi_buffer_set_usage.
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    buffer->usage = usage;

    // FIXME: add a generic callback to the buffer interface
    if (!ggml_backend_buffer_is_multi_buffer(buffer)) {
        return;
    }

    ggml_backend_multi_buffer_set_usage(buffer, usage);
}
// Returns the buffer type stored in the buffer struct.
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_type_t type = buffer->buft;
    return type;
}
2024-01-12 20:07:38 +01:00
// Invokes the buffer's reset callback.
// The callback is optional: when it is not set, nothing happens.
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
    if (buffer->iface.reset == NULL) {
        return;
    }

    buffer->iface.reset(buffer);
}
bool ggml_backend_buffer_copy_tensor ( const struct ggml_tensor * src , struct ggml_tensor * dst ) {
ggml_backend_buffer_t dst_buf = dst - > view_src ? dst - > view_src - > buffer : dst - > buffer ;
if ( dst_buf - > iface . cpy_tensor ) {
2024-06-03 19:03:26 +02:00
return dst_buf - > iface . cpy_tensor ( dst_buf , src , dst ) ;
2024-01-12 20:07:38 +01:00
}
return false ;
}
2023-12-07 21:26:54 +01:00
// backend
2024-02-24 17:27:36 +01:00
// Returns the guid stored in the backend, or NULL when `backend` is NULL.
ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
    if (!backend) {
        return NULL;
    }

    return backend->guid;
}
2023-10-08 19:19:14 +02:00
// Returns the string produced by the backend's get_name callback.
// A NULL backend yields a fixed placeholder string instead.
const char * ggml_backend_name(ggml_backend_t backend) {
    if (!backend) {
        return " NULL ";
    }

    const char * name = backend->iface.get_name(backend);
    return name;
}
// Releases a backend through its own free callback. Passing NULL is a no-op.
void ggml_backend_free(ggml_backend_t backend) {
    if (backend != NULL) {
        backend->iface.free(backend);
    }
}
2023-12-07 21:26:54 +01:00
// Returns the buffer type reported by the backend's get_default_buffer_type callback.
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
    ggml_backend_buffer_type_t default_type = backend->iface.get_default_buffer_type(backend);
    return default_type;
}
2023-10-08 19:19:14 +02:00
// Allocates a `size`-byte buffer using the backend's default buffer type.
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    ggml_backend_buffer_type_t default_type = ggml_backend_get_default_buffer_type(backend);
    return ggml_backend_buft_alloc_buffer(default_type, size);
}
// Returns the alignment of the backend's default buffer type.
size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    ggml_backend_buffer_type_t default_type = ggml_backend_get_default_buffer_type(backend);
    return ggml_backend_buft_get_alignment(default_type);
}
ggml : add Vulkan backend (#2059)
* Vulkan loader code
* Fix matmul kernel, continue implementation
* Continue implementation
* Vulkan memory management
* Vulkan development
* Matmul call
* Add aligned malloc and free for VMA
* Continue implementation
* First matmul success
* GEMM Kernel optimization
* 1D Blocktiling
* 2D Blocktiling
* Write coalescing
* Continue vulkan implementation and optimization
* First FP16 attempt, disabled for now
* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel
* Enable device extensions properly, restore fp16 matmul op
* Fix mulmat_f16
* Output FP32 in fp16 matmul shader
* Fix f16_to_f32 kernel
* dequant_q4_0 kernel
* Add VMA library
* Avoid requesting dedicated memory, VMA can decide that by itself
* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly
* add cmake commands
* Add 2d write operation, profiling code
* Fix 2d write
* Fix queue selection for AMD RADV
* Fix trailing whitespace in vk_mem_alloc.h
* Add WIP warp tile mat mul shaders
* Disable glslc optimization
* Disable glslc optimization for CMake
* Optimize warptile matmul shader, replace blocktile with it
* Add split-k optimization for small matrix multiplication
Use semaphores for synchronization instead of fences or waitidle
Rework async write/read for synchronization
* Fix validation errors, improve compatibility with AMD GPUs
* Rework command buffer handling
* Variable matmul kernel using specialization constants
* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints
* Reuse semaphores
* Handle stage flags during command buffer submission properly
* Increase matmul test runs for consistent results
* Fix F32 matmul
* Add vectorized loading and zeropadding for matrix multiplication
* Use pinned memory for f16 preprocessing
* Don't force aligned matmul
* Don't free before queue done
* Replace VMA library with native Vulkan buffer management
* Basic offloading support with mul_f32 and dmmv for q4_0
* Run glslc commands in parallel
* Unroll loops in dmmv shader
* Reduce usage of waitIdle
* Reuse pinned allocation for f16 conversion
* Handle devices with only a single queue
* Fix trailing whitespace in CMakeLists.txt
* Allow parallel execution of kernels, parallelize third and fourth dimension calls
* Add fallback for devices only supporting one DescriptorSet per DescriptorPool
* Move to graph function similar to CUDA implementation
* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function
* Add F32 dmmv shaders
* Batch submissions
* Add .spv to gitignore
* Split off matrix vector multiplication for separate optimization
* Use single command buffer for matrix vector multiplication ops
* Reduce overhead of mul_f32 calls by using a single command buffer
* Add submission batching to mul_f32
* Fix tests
* Add missing barrier
* Add further missing barrier
* Add further ops
* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions
* Remove unnecessary cblas link
* Fix descriptor set pre-allocation assert
* Add runtime shader compilation, start transferring shaders to this approach
* Transfer remaining shaders to header and compile on runtime
* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16
* Add support for q4_1, q5_0, q5_1 and q8_0
* Remove unnecessary scalar layout extension
* Parse graph early to pre-record command buffers
* Add q6_k support
* Add multi-submit for command buffers
* Fix q6_k dequant shader for AMD
* Fix q6_k for GPUs without fp16 support
* Simplify q6_k fp16 fix
* Minor fixes
* Fix wg_denom of m-mulmat shaders
* Add Python-based Vulkan shader generator
* Replace shaderc dependency with precompiled shaders
Fix python script to generate shaders
* Clean up code
* Fix shader generator script Windows compatibility
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
* Close file before deletion
* Fix vulkan shader fp32 name
* Add q2_k and q3_k support
Add validation check to compare shader results to cpu results
* Add q4_k support
* Add q5_k support
* Bake SPIR-V bytecode into the library instead of loading shaders from file
* Switch to signal semaphores for flexibility
Prepare broadcasting support for mul mat
* Finish broadcasting mul mat support for GQA
* Clean up unused functions
Add repeat op
* Add further ops, not yet enabled. Improve semaphore code
* Reduce number of used semaphores by utilizing timelines more properly
* Remove queue information
* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations
* Add Vulkan to llama-bench
* Remove cblas dependency
* Fix matmul k-split bug
* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader
* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug
* Fix issues with float16 overflows in shaders
* Fix issues with older Vulkan headers on Ubuntu 22.04
* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers
* Implement further ops, rework op_f32 calls, fix bugs
* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code
* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders
* Merge upstream changes, fix conflicts, adapt soft_max op
* Fix Python and shader header format
* Free model gpu buffers on exit
* Use single queue per device to simplify code
* Add matmul shader support for running multiple calculations in parallel
* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible
* Fix missing event cast
* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity
* Fix warning about empty C function parameters
* Fix compiler warnings
* Properly implement Vulkan backend buffer handling
* Fix oversized host staging buffers
* Simplify barrier synchronization calls
* Fix gcc warnings
* Implement max_size for backend buffer types to limit the size of a single allocation
* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size
* refactor multi buf
* Disable unsupported ops to fix tests
* Check for maintenance4 support before using it
* Handle devices with only a single queue
* Fix single queue logic
* propagate buffer usage in multi buffers
* Implement rope_neox op
* Cleanup header and other files
* Simplify gpu_extras by removing events and putting staging memcpys into contexts
* Move queue into context
Add not-yet-enabled async backend ops
* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization
* Add get_max_size to SYCL backend.
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* llama : fix trailing whitespace
---------
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 18:03:59 +01:00
// Returns the maximum size of the backend's default buffer type.
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
    ggml_backend_buffer_type_t default_type = ggml_backend_get_default_buffer_type(backend);
    return ggml_backend_buft_get_max_size(default_type);
}
2023-12-07 21:26:54 +01:00
// Write `size` bytes from `data` into `tensor`, starting at byte `offset`.
// Uses the backend's set_tensor_async hook when it is provided; otherwise
// falls back to ggml_backend_tensor_set.
// Aborts (GGML_ASSERT) if the tensor has no data pointer or the range does not
// fit inside ggml_nbytes(tensor).
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    // compare against the remaining space instead of computing offset + size,
    // so the check cannot be bypassed by size_t wraparound
    const size_t nbytes = ggml_nbytes(tensor);
    GGML_ASSERT(offset <= nbytes && size <= nbytes - offset && "tensor write out of bounds");

    if (backend->iface.set_tensor_async == NULL) {
        ggml_backend_tensor_set(tensor, data, offset, size);
    } else {
        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
    }
}
2023-12-07 21:26:54 +01:00
// Read `size` bytes from `tensor`, starting at byte `offset`, into `data`.
// Uses the backend's get_tensor_async hook when it is provided; otherwise
// falls back to ggml_backend_tensor_get.
// Aborts (GGML_ASSERT) if the tensor has no data pointer or the range does not
// fit inside ggml_nbytes(tensor).
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    // compare against the remaining space instead of computing offset + size,
    // so the check cannot be bypassed by size_t wraparound
    const size_t nbytes = ggml_nbytes(tensor);
    GGML_ASSERT(offset <= nbytes && size <= nbytes - offset && "tensor read out of bounds");

    if (backend->iface.get_tensor_async == NULL) {
        ggml_backend_tensor_get(tensor, data, offset, size);
    } else {
        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
    }
}
2024-01-16 12:16:33 +01:00
GGML_CALL void ggml_backend_tensor_set ( struct ggml_tensor * tensor , const void * data , size_t offset , size_t size ) {
2024-01-12 20:07:38 +01:00
ggml_backend_buffer_t buf = tensor - > view_src ? tensor - > view_src - > buffer : tensor - > buffer ;
GGML_ASSERT ( buf ! = NULL & & " tensor buffer not set " ) ;
2024-03-13 18:54:21 +01:00
GGML_ASSERT ( tensor - > data ! = NULL & & " tensor not allocated " ) ;
2023-12-07 21:26:54 +01:00
GGML_ASSERT ( offset + size < = ggml_nbytes ( tensor ) & & " tensor write out of bounds " ) ;
2023-11-13 13:16:23 +01:00
2024-02-13 22:44:25 +01:00
if ( ! size ) {
return ;
}
2024-03-13 18:54:21 +01:00
buf - > iface . set_tensor ( buf , tensor , data , offset , size ) ;
2023-10-08 19:19:14 +02:00
}
2024-01-16 12:16:33 +01:00
GGML_CALL void ggml_backend_tensor_get ( const struct ggml_tensor * tensor , void * data , size_t offset , size_t size ) {
2024-01-12 20:07:38 +01:00
ggml_backend_buffer_t buf = tensor - > view_src ? tensor - > view_src - > buffer : tensor - > buffer ;
2024-03-13 18:54:21 +01:00
GGML_ASSERT ( buf ! = NULL & & " tensor buffer not set " ) ;
2023-11-13 13:16:23 +01:00
GGML_ASSERT ( tensor - > data ! = NULL & & " tensor not allocated " ) ;
2023-12-07 21:26:54 +01:00
GGML_ASSERT ( offset + size < = ggml_nbytes ( tensor ) & & " tensor read out of bounds " ) ;
2023-11-13 13:16:23 +01:00
2024-02-13 22:44:25 +01:00
if ( ! size ) {
return ;
}
2024-03-13 18:54:21 +01:00
buf - > iface . get_tensor ( buf , tensor , data , offset , size ) ;
2023-10-08 19:19:14 +02:00
}
// Invoke the backend's synchronize hook. The hook is optional: when it is
// NULL this call does nothing.
void ggml_backend_synchronize(ggml_backend_t backend) {
    if (backend->iface.synchronize != NULL) {
        backend->iface.synchronize(backend);
    }
}
// Forward to the backend's graph_plan_create hook and return its result.
// Aborts (GGML_ASSERT) when the backend does not implement the hook.
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    GGML_ASSERT(backend->iface.graph_plan_create != NULL);

    ggml_backend_graph_plan_t plan = backend->iface.graph_plan_create(backend, cgraph);
    return plan;
}
// Forward `plan` to the backend's graph_plan_free hook.
// Aborts (GGML_ASSERT) when the backend does not implement the hook.
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(backend->iface.graph_plan_free != NULL);

    backend->iface.graph_plan_free(backend, plan);
}
2024-03-04 10:05:42 +01:00
// Forward `plan` to the backend's graph_plan_compute hook and return its status.
// Aborts (GGML_ASSERT) when the backend does not implement the hook.
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);

    const enum ggml_status status = backend->iface.graph_plan_compute(backend, plan);
    return status;
}
2024-03-04 10:05:42 +01:00
// Blocking graph evaluation: start the computation with
// ggml_backend_graph_compute_async, then synchronize the backend.
// The status of the async call is returned; synchronization happens regardless of it.
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    const enum ggml_status status = ggml_backend_graph_compute_async(backend, cgraph);

    ggml_backend_synchronize(backend);

    return status;
}
2024-03-18 11:03:04 +01:00
// Hand `cgraph` to the backend's graph_compute hook and return its status.
// No synchronization is performed here.
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    const enum ggml_status status = backend->iface.graph_compute(backend, cgraph);
    return status;
}
// Returns the answer of the backend's supports_op hook for `op`.
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    const bool supported = backend->iface.supports_op(backend, op);
    return supported;
}
2024-06-13 03:11:35 +02:00
// Returns the answer of the backend's supports_buft hook for `buft`.
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    const bool supported = backend->iface.supports_buft(backend, buft);
    return supported;
}
2024-03-18 11:03:04 +01:00
// Returns the answer of the backend's offload_op hook for `op`.
// The hook is optional: a backend without one always answers false.
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    if (backend->iface.offload_op == NULL) {
        return false;
    }

    return backend->iface.offload_op(backend, op);
}
2023-10-08 19:19:14 +02:00
// backend copy
static bool ggml_are_same_layout ( const struct ggml_tensor * a , const struct ggml_tensor * b ) {
if ( a - > type ! = b - > type ) {
return false ;
}
for ( int i = 0 ; i < GGML_MAX_DIMS ; i + + ) {
if ( a - > ne [ i ] ! = b - > ne [ i ] ) {
return false ;
}
if ( a - > nb [ i ] ! = b - > nb [ i ] ) {
return false ;
}
}
return true ;
}
void ggml_backend_tensor_copy ( struct ggml_tensor * src , struct ggml_tensor * dst ) {
GGML_ASSERT ( ggml_are_same_layout ( src , dst ) & & " cannot copy tensors with different layouts " ) ;
if ( src = = dst ) {
return ;
}
2024-01-12 20:07:38 +01:00
if ( ggml_backend_buffer_is_host ( src - > buffer ) ) {
ggml_backend_tensor_set ( dst , src - > data , 0 , ggml_nbytes ( src ) ) ;
} else if ( ggml_backend_buffer_is_host ( dst - > buffer ) ) {
ggml_backend_tensor_get ( src , dst - > data , 0 , ggml_nbytes ( src ) ) ;
} else if ( ! ggml_backend_buffer_copy_tensor ( src , dst ) ) {
# ifndef NDEBUG
fprintf ( stderr , " %s: warning: slow copy from %s to %s \n " , __func__ , ggml_backend_buffer_name ( src - > buffer ) , ggml_backend_buffer_name ( dst - > buffer ) ) ;
# endif
2023-10-08 19:19:14 +02:00
size_t nbytes = ggml_nbytes ( src ) ;
void * data = malloc ( nbytes ) ;
ggml_backend_tensor_get ( src , data , 0 , nbytes ) ;
ggml_backend_tensor_set ( dst , data , 0 , nbytes ) ;
free ( data ) ;
}
}
2024-03-13 18:54:21 +01:00
// Copy `src` (used by backend_src) into `dst` (used by backend_dst).
// Both tensors must have the same layout (asserted); src == dst is a no-op.
// The destination backend's cpy_tensor_async hook is tried first; if it is
// missing or returns false, the copy is done with explicit synchronization.
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    if (src == dst) {
        return;
    }

    if (backend_dst->iface.cpy_tensor_async != NULL &&
        backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
        return;
    }

    // an async copy would normally happen after all the queued operations on both backends are completed
    // sync src, set_async dst
    const bool src_is_host = ggml_backend_buffer_is_host(src->buffer);

    // both fallback paths synchronize the source backend first
    ggml_backend_synchronize(backend_src);

    if (src_is_host) {
        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
        return;
    }

    ggml_backend_tensor_copy(src, dst);
    ggml_backend_synchronize(backend_dst);
}
// events
// Create an event through the backend's event_new hook.
// Returns NULL when the backend does not implement the hook.
ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
    if (backend->iface.event_new != NULL) {
        return backend->iface.event_new(backend);
    }

    return NULL;
}
// Release `event` through the event_free hook of the backend it references.
// Passing NULL is allowed and does nothing.
void ggml_backend_event_free(ggml_backend_event_t event) {
    if (event != NULL) {
        event->backend->iface.event_free(event);
    }
}
// Forward `event` to the event_record hook of the backend it references.
// Aborts (GGML_ASSERT) when that backend does not implement the hook.
void ggml_backend_event_record(ggml_backend_event_t event) {
    GGML_ASSERT(event->backend->iface.event_record != NULL);

    event->backend->iface.event_record(event);
}
2024-03-13 18:54:21 +01:00
// Forward `event` to the event_synchronize hook of the backend it references.
// Aborts (GGML_ASSERT) when that backend does not implement the hook.
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);

    event->backend->iface.event_synchronize(event);
}
// Forward `event` to the event_wait hook of `backend`.
// Aborts (GGML_ASSERT) when `backend` does not implement the hook.
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    GGML_ASSERT(backend->iface.event_wait != NULL);

    backend->iface.event_wait(backend, event);
}
2024-01-12 20:07:38 +01:00
2023-12-07 21:26:54 +01:00
// backend registry
2023-10-08 19:19:14 +02:00
2024-03-13 18:54:21 +01:00
# define GGML_REG_MAX_BACKENDS 16
2023-12-07 21:26:54 +01:00
struct ggml_backend_reg {
char name [ 128 ] ;
ggml_backend_init_fn init_fn ;
ggml_backend_buffer_type_t default_buffer_type ;
void * user_data ;
2023-10-08 19:19:14 +02:00
} ;
2024-03-13 18:54:21 +01:00
static struct ggml_backend_reg ggml_backend_registry [ GGML_REG_MAX_BACKENDS ] ;
2023-12-07 21:26:54 +01:00
static size_t ggml_backend_registry_count = 0 ;
2024-01-16 12:16:33 +01:00
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init ( const char * params , void * user_data ) ;
2023-12-07 21:26:54 +01:00
2024-01-16 12:16:33 +01:00
GGML_CALL static void ggml_backend_registry_init ( void ) {
2023-12-07 21:26:54 +01:00
static bool initialized = false ;
if ( initialized ) {
return ;
}
initialized = true ;
2023-10-08 19:19:14 +02:00
2023-12-07 21:26:54 +01:00
ggml_backend_register ( " CPU " , ggml_backend_reg_cpu_init , ggml_backend_cpu_buffer_type ( ) , NULL ) ;
// add forward decls here to avoid including the backend headers
2024-03-26 01:16:01 +01:00
# ifdef GGML_USE_CUDA
2024-01-16 12:16:33 +01:00
extern GGML_CALL void ggml_backend_cuda_reg_devices ( void ) ;
2023-12-07 21:26:54 +01:00
ggml_backend_cuda_reg_devices ( ) ;
# endif
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug functio, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add know issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export funciton print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changed in cuda cmake
remove useless nommq
fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug.
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math.
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 16:56:23 +01:00
# ifdef GGML_USE_SYCL
extern void ggml_backend_sycl_reg_devices ( void ) ;
ggml_backend_sycl_reg_devices ( ) ;
# endif
2023-12-07 21:26:54 +01:00
# ifdef GGML_USE_METAL
2024-01-16 12:16:33 +01:00
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init ( const char * params , void * user_data ) ;
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type ( void ) ;
2023-12-07 21:26:54 +01:00
ggml_backend_register ( " Metal " , ggml_backend_reg_metal_init , ggml_backend_metal_buffer_type ( ) , NULL ) ;
# endif
ggml : add Vulkan backend (#2059)
* Vulkan loader code
* Fix matmul kernel, continue implementation
* Continue implementation
* Vulkan memory management
* Vulkan development
* Matmul call
* Add aligned malloc and free for VMA
* Continue implementation
* First matmul success
* GEMM Kernel optimization
* 1D Blocktiling
* 2D Blocktiling
* Write coalescing
* Continue vulkan implementation and optimization
* First FP16 attempt, disabled for now
* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel
* Enable device extensions properly, restore fp16 matmul op
* Fix mulmat_f16
* Output FP32 in fp16 matmul shader
* Fix f16_to_f32 kernel
* dequant_q4_0 kernel
* Add VMA library
* Avoid requesting dedicated memory, VMA can decide that by itself
* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly
* add cmake commands
* Add 2d write operation, profiling code
* Fix 2d write
* Fix queue selection for AMD RADV
* Fix trailing whitespace in vk_mem_alloc.h
* Add WIP warp tile mat mul shaders
* Disable glslc optimization
* Disable glslc optimization for CMake
* Optimize warptile matmul shader, replace blocktile with it
* Add split-k optimization for small matrix multiplication
Use semaphores for synchronization instead of fences or waitidle
Rework async write/read for synchronization
* Fix validation errors, improve compatibility with AMD GPUs
* Rework command buffer handling
* Variable matmul kernel using specialization constants
* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints
* Reuse semaphores
* Handle stage flags during command buffer submission properly
* Increase matmul test runs for consistent results
* Fix F32 matmul
* Add vectorized loading and zeropadding for matrix multiplication
* Use pinned memory for f16 preprocessing
* Don't force aligned matmul
* Don't free before queue done
* Replace VMA library with native Vulkan buffer management
* Basic offloading support with mul_f32 and dmmv for q4_0
* Run glslc commands in parallel
* Unroll loops in dmmv shader
* Reduce usage of waitIdle
* Reuse pinned allocation for f16 conversion
* Handle devices with only a single queue
* Fix trailing whitespace in CMakeLists.txt
* Allow parallel execution of kernels, parallelize third and fourth dimension calls
* Add fallback for devices only supporting one DescriptorSet per DescriptorPool
* Move to graph function similar to CUDA implementation
* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function
* Add F32 dmmv shaders
* Batch submissions
* Add .spv to gitignore
* Split off matrix vector multiplication for separate optimization
* Use single command buffer for matrix vector multiplication ops
* Reduce overhead of mul_f32 calls by using a single command buffer
* Add submission batching to mul_f32
* Fix tests
* Add missing barrier
* Add further missing barrier
* Add further ops
* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions
* Remove unnecessary cblas link
* Fix descriptor set pre-allocation assert
* Add runtime shader compilation, start transferring shaders to this approach
* Transfer remaining shaders to header and compile on runtime
* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16
* Add support for q4_1, q5_0, q5_1 and q8_0
* Remove unnecessary scalar layout extension
* Parse graph early to pre-record command buffers
* Add q6_k support
* Add multi-submit for command buffers
* Fix q6_k dequant shader for AMD
* Fix q6_k for GPUs without fp16 support
* Simplify q6_k fp16 fix
* Minor fixes
* Fix wg_denom of m-mulmat shaders
* Add Python-based Vulkan shader generator
* Replace shaderc dependency with precompiled shaders
Fix python script to generate shaders
* Clean up code
* Fix shader generator script Windows compatibility
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
* Close file before deletion
* Fix vulkan shader fp32 name
* Add q2_k and q3_k support
Add validation check to compare shader results to cpu results
* Add q4_k support
* Add q5_k support
* Bake SPIR-V bytecode into the library instead of loading shaders from file
* Switch to signal semaphores for flexibility
Prepare broadcasting support for mul mat
* Finish broadcasting mul mat support for GQA
* Clean up unused functions
Add repeat op
* Add further ops, not yet enabled. Improve semaphore code
* Reduce number of used semaphores by utilizing timelines more properly
* Remove queue information
* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations
* Add Vulkan to llama-bench
* Remove cblas dependency
* Fix matmul k-split bug
* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader
* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug
* Fix issues with float16 overflows in shaders
* Fix issues with older Vulkan headers on Ubuntu 22.04
* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers
* Implement further ops, rework op_f32 calls, fix bugs
* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code
* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders
* Merge upstream changes, fix conflicts, adapt soft_max op
* Fix Python and shader header format
* Free model gpu buffers on exit
* Use single queue per device to simplify code
* Add matmul shader support for running multiple calculations in parallel
* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible
* Fix missing event cast
* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity
* Fix warning about empty C function parameters
* Fix compiler warnings
* Properly implement Vulkan backend buffer handling
* Fix oversized host staging buffers
* Simplify barrier synchronization calls
* Fix gcc warnings
* Implement max_size for backend buffer types to limit the size of a single allocation
* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size
* refactor multi buf
* Disable unsupported ops to fix tests
* Check for maintenance4 support before using it
* Handle devices with only a single queue
* Fix single queue logic
* propagate buffer usage in multi buffers
* Implement rope_neox op
* Cleanup header and other files
* Simplify gpu_extras by removing events and putting staging memcpys into contexts
* Move queue into context
Add not-yet-enabled async backend ops
* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization
* Add get_max_size to SYCL backend.
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* llama : fix trailing whitespace
---------
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 18:03:59 +01:00
# ifdef GGML_USE_VULKAN
extern GGML_CALL int ggml_backend_vk_reg_devices ( void ) ;
ggml_backend_vk_reg_devices ( ) ;
# endif
2024-01-29 21:50:50 +01:00
# ifdef GGML_USE_KOMPUTE
extern GGML_CALL void ggml_backend_kompute_reg_devices ( void ) ;
ggml_backend_kompute_reg_devices ( ) ;
# endif
2023-10-08 19:19:14 +02:00
}
2024-01-16 12:16:33 +01:00
// Append a backend to the registry.
//   name                - copied into the entry, truncated to fit its 128-byte name field
//   init_fn             - stored; invoked later by ggml_backend_reg_init_backend
//   default_buffer_type - stored as the entry's default buffer type
//   user_data           - stored and handed back to init_fn
// Aborts (GGML_ASSERT) when the registry already holds GGML_REG_MAX_BACKENDS entries.
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);

    struct ggml_backend_reg * slot = &ggml_backend_registry[ggml_backend_registry_count];

    // start from an all-zero name, then copy with guaranteed NUL termination
    memset(slot->name, 0, sizeof(slot->name));
    slot->init_fn             = init_fn;
    slot->default_buffer_type = default_buffer_type;
    slot->user_data           = user_data;

    snprintf(slot->name, sizeof(slot->name), "%s", name);

#ifndef NDEBUG
    fprintf(stderr, "%s: registered backend %s\n", __func__, name);
#endif

    ggml_backend_registry_count++;
}
// Number of registered backends; initializes the registry on first use.
size_t ggml_backend_reg_get_count(void) {
    ggml_backend_registry_init();

    const size_t count = ggml_backend_registry_count;
    return count;
}
// Look up a registered backend by exact (case-sensitive) name.
// Returns the registry index of the first match, or SIZE_MAX when no entry matches.
size_t ggml_backend_reg_find_by_name(const char * name) {
    ggml_backend_registry_init();

    size_t idx = 0;
    while (idx < ggml_backend_registry_count) {
        // TODO: case insensitive in a portable way
        if (strcmp(ggml_backend_registry[idx].name, name) == 0) {
            return idx;
        }
        idx++;
    }

    // not found
    return SIZE_MAX;
}
// init from backend:params string
// Initialize a backend from a "name" or "name:params" string.
// The part before the first ':' selects the registry entry; everything after
// it is passed as the params string (empty when there is no ':').
// Returns NULL, after printing a message to stderr, when the name is not registered.
ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
    ggml_backend_registry_init();

    char backend_name[128];
    const char * params = "";
    const char * separator = strchr(backend_str, ':');

    if (separator != NULL) {
        // copy only the characters in front of the separator
        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(separator - backend_str), backend_str);
        params = separator + 1;
    } else {
        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
    }

    const size_t backend_i = ggml_backend_reg_find_by_name(backend_name);

    if (backend_i == SIZE_MAX) {
        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
        return NULL;
    }

    return ggml_backend_reg_init_backend(backend_i, params);
}
const char * ggml_backend_reg_get_name ( size_t i ) {
ggml_backend_registry_init ( ) ;
GGML_ASSERT ( i < ggml_backend_registry_count ) ;
return ggml_backend_registry [ i ] . name ;
}
ggml_backend_t ggml_backend_reg_init_backend ( size_t i , const char * params ) {
ggml_backend_registry_init ( ) ;
GGML_ASSERT ( i < ggml_backend_registry_count ) ;
return ggml_backend_registry [ i ] . init_fn ( params , ggml_backend_registry [ i ] . user_data ) ;
}
ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type ( size_t i ) {
ggml_backend_registry_init ( ) ;
GGML_ASSERT ( i < ggml_backend_registry_count ) ;
return ggml_backend_registry [ i ] . default_buffer_type ;
}
ggml_backend_buffer_t ggml_backend_reg_alloc_buffer ( size_t i , size_t size ) {
ggml_backend_registry_init ( ) ;
GGML_ASSERT ( i < ggml_backend_registry_count ) ;
return ggml_backend_buft_alloc_buffer ( ggml_backend_registry [ i ] . default_buffer_type , size ) ;
2023-10-08 19:19:14 +02:00
}
2023-12-07 21:26:54 +01:00
// backend CPU
2024-02-12 08:16:06 +01:00
// required for mmap as gguf only guarantees 32-byte alignment
static const size_t TENSOR_ALIGNMENT = 32;
2024-01-16 12:16:33 +01:00
// Every CPU buffer reports the same fixed name.
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
    GGML_UNUSED(buffer);

    return "CPU";
}
2024-01-16 12:16:33 +01:00
// Base address of the buffer: buffer->context, padded with GGML_PAD to
// TENSOR_ALIGNMENT when it is not already a multiple of it.
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    const uintptr_t addr = (uintptr_t) buffer->context;

    if (addr % TENSOR_ALIGNMENT == 0) {
        // already aligned
        return (void *) addr;
    }

    // align the buffer
    return (void *) GGML_PAD(addr, TENSOR_ALIGNMENT);
}
2024-01-16 12:16:33 +01:00
// Release the memory held in buffer->context with free().
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * memory = buffer->context;

    free(memory);
}
2024-01-16 12:16:33 +01:00
// Copy `size` bytes from `data` into the tensor's memory at byte `offset`.
// No bounds checking is done here.
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memcpy(dst, data, size);
}
2024-01-16 12:16:33 +01:00
// Copy `size` bytes from the tensor's memory at byte `offset` into `data`.
// No bounds checking is done here.
GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    const char * from = (const char *) tensor->data + offset;
    memcpy(data, from, size);
}
2024-01-16 12:16:33 +01:00
// Copy `src` into `dst` with a plain memcpy of ggml_nbytes(src) bytes.
// Only done when src lives in a host buffer; returns false otherwise.
GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_UNUSED(buffer);

    if (!ggml_backend_buffer_is_host(src->buffer)) {
        return false;
    }

    memcpy(dst->data, src->data, ggml_nbytes(src));
    return true;
}
2024-01-16 12:16:33 +01:00
// Fill buffer->size bytes with `value`, starting at buffer->context.
GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    void * start = buffer->context;

    memset(start, value, buffer->size);
}
2023-10-08 19:19:14 +02:00
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
2024-01-12 20:07:38 +01:00
/* .get_name = */ ggml_backend_cpu_buffer_name ,
2023-12-07 21:26:54 +01:00
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer ,
/* .get_base = */ ggml_backend_cpu_buffer_get_base ,
/* .init_tensor = */ NULL , // no initialization required
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor ,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor ,
2024-01-12 20:07:38 +01:00
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor ,
2023-12-21 21:07:46 +01:00
/* .clear = */ ggml_backend_cpu_buffer_clear ,
2024-01-12 20:07:38 +01:00
/* .reset = */ NULL ,
2023-10-08 19:19:14 +02:00
} ;
// for buffers from ptr, free is not called
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
2024-01-12 20:07:38 +01:00
/* .get_name = */ ggml_backend_cpu_buffer_name ,
2023-12-07 21:26:54 +01:00
/* .free_buffer = */ NULL , // ptr is not owned by the buffer, so it does not need to be freed
/* .get_base = */ ggml_backend_cpu_buffer_get_base ,
/* .init_tensor = */ NULL , // no initialization required
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor ,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor ,
2024-01-12 20:07:38 +01:00
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor ,
2023-12-21 21:07:46 +01:00
/* .clear = */ ggml_backend_cpu_buffer_clear ,
2024-01-12 20:07:38 +01:00
/* .reset = */ NULL ,
2023-10-08 19:19:14 +02:00
} ;
2024-01-16 12:16:33 +01:00
// The CPU buffer type reports a fixed name.
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return "CPU";
}
2024-01-16 12:16:33 +01:00
// Allocate a CPU buffer with malloc. TENSOR_ALIGNMENT extra bytes are requested
// and the enlarged size is what gets recorded in the buffer.
// Returns NULL, after printing a message to stderr, when malloc fails.
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // malloc may return an address that is not aligned
    size += TENSOR_ALIGNMENT;

    // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
    void * data = malloc(size);

    if (data != NULL) {
        return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
    }

    fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
    return NULL;
}
2024-01-16 12:16:33 +01:00
// The CPU buffer type always reports TENSOR_ALIGNMENT.
GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return TENSOR_ALIGNMENT;
}
2023-10-08 19:19:14 +02:00
2024-01-16 12:16:33 +01:00
// The CPU buffer type always reports itself as a host buffer type.
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return true;
}
2024-01-16 12:16:33 +01:00
// Returns the buffer type describing plain CPU memory.
// Every call hands back a pointer to the same function-local static instance.
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
    static struct ggml_backend_buffer_type cpu_buft = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_cpu_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .context = */ NULL,
    };

    return &cpu_buft;
}
# ifdef GGML_USE_CPU_HBM
// buffer type HBM
# include <hbwmalloc.h>
2024-01-16 12:16:33 +01:00
// Name callback of the HBM buffer type: always the same constant string.
// NOTE(review): the literal is reproduced verbatim, padding spaces included - confirm against upstream.
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return " CPU_HBM ";
}
2024-01-16 12:16:33 +01:00
// Name callback installed on individual HBM buffers (see the HBM alloc_buffer callback).
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
    GGML_UNUSED(buf);

    return " CPU_HBM ";
}
2024-01-16 12:16:33 +01:00
// free_buffer callback for HBM buffers: passes buffer->context to hbw_free.
// presumably the context is the pointer obtained from hbw_posix_memalign - confirm in ggml_backend_buffer_init
GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * hbm_mem = buffer->context;

    hbw_free(hbm_mem);
}
2024-01-16 12:16:33 +01:00
// Allocates `size` bytes with hbw_posix_memalign, using the CPU buffer type alignment,
// and wraps them in a CPU buffer whose name and free callbacks are replaced by the
// HBM-specific ones. Returns NULL (after logging to stderr) when the allocation fails.
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * hbm_mem;
    const size_t alignment = ggml_backend_cpu_buffer_type_get_alignment(buft);

    if (hbw_posix_memalign(&hbm_mem, alignment, size) != 0) {
        fprintf(stderr, " failed to allocate HBM buffer of size %zu \n ", size);
        return NULL;
    }

    // start from a regular from-pointer CPU buffer, then retarget it at this buffer type
    ggml_backend_buffer_t hbm_buffer = ggml_backend_cpu_buffer_from_ptr(hbm_mem, size);

    hbm_buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
    hbm_buffer->iface.get_name    = ggml_backend_cpu_hbm_buffer_get_name;
    hbm_buffer->buft              = buft;

    return hbm_buffer;
}
2024-01-12 20:07:38 +01:00
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type ( void ) {
2023-12-21 21:07:46 +01:00
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
2024-01-12 20:07:38 +01:00
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name ,
2023-12-21 21:07:46 +01:00
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer ,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment ,
ggml : add Vulkan backend (#2059)
* Vulkan loader code
* Fix matmul kernel, continue implementation
* Continue implementation
* Vulkan memory management
* Vulkan development
* Matmul call
* Add aligned malloc and free for VMA
* Continue implementation
* First matmul success
* GEMM Kernel optimization
* 1D Blocktiling
* 2D Blocktiling
* Write coalescing
* Continue vulkan implementation and optimization
* First FP16 attempt, disabled for now
* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel
* Enable device extensions properly, restore fp16 matmul op
* Fix mulmat_f16
* Output FP32 in fp16 matmul shader
* Fix f16_to_f32 kernel
* dequant_q4_0 kernel
* Add VMA library
* Avoid requesting dedicated memory, VMA can decide that by itself
* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly
* add cmake commands
* Add 2d write operation, profiling code
* Fix 2d write
* Fix queue selection for AMD RADV
* Fix trailing whitespace in vk_mem_alloc.h
* Add WIP warp tile mat mul shaders
* Disable glslc optimization
* Disable glslc optimization for CMake
* Optimize warptile matmul shader, replace blocktile with it
* Add split-k optimization for small matrix multiplication
Use semaphores for synchronization instead of fences or waitidle
Rework async write/read for synchronization
* Fix validation errors, improve compatibility with AMD GPUs
* Rework command buffer handling
* Variable matmul kernel using specialization constants
* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints
* Reuse semaphores
* Handle stage flags during command buffer submission properly
* Increase matmul test runs for consistent results
* Fix F32 matmul
* Add vectorized loading and zeropadding for matrix multiplication
* Use pinned memory for f16 preprocessing
* Don't force aligned matmul
* Don't free before queue done
* Replace VMA library with native Vulkan buffer management
* Basic offloading support with mul_f32 and dmmv for q4_0
* Run glslc commands in parallel
* Unroll loops in dmmv shader
* Reduce usage of waitIdle
* Reuse pinned allocation for f16 conversion
* Handle devices with only a single queue
* Fix trailing whitespace in CMakeLists.txt
* Allow parallel execution of kernels, parallelize third and fourth dimension calls
* Add fallback for devices only supporting one DescriptorSet per DescriptorPool
* Move to graph function similar to CUDA implementation
* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function
* Add F32 dmmv shaders
* Batch submissions
* Add .spv to gitignore
* Split off matrix vector multiplication for separate optimization
* Use single command buffer for matrix vector multiplication ops
* Reduce overhead of mul_f32 calls by using a single command buffer
* Add submission batching to mul_f32
* Fix tests
* Add missing barrier
* Add further missing barrier
* Add further ops
* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions
* Remove unnecessary cblas link
* Fix descriptor set pre-allocation assert
* Add runtime shader compilation, start transferring shaders to this approach
* Transfer remaining shaders to header and compile on runtime
* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16
* Add support for q4_1, q5_0, q5_1 and q8_0
* Remove unnecessary scalar layout extension
* Parse graph early to pre-record command buffers
* Add q6_k support
* Add multi-submit for command buffers
* Fix q6_k dequant shader for AMD
* Fix q6_k for GPUs without fp16 support
* Simplify q6_k fp16 fix
* Minor fixes
* Fix wg_denom of m-mulmat shaders
* Add Python-based Vulkan shader generator
* Replace shaderc dependency with precompiled shaders
Fix python script to generate shaders
* Clean up code
* Fix shader generator script Windows compatibility
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
* Close file before deletion
* Fix vulkan shader fp32 name
* Add q2_k and q3_k support
Add validation check to compare shader results to cpu results
* Add q4_k support
* Add q5_k support
* Bake SPIR-V bytecode into the library instead of loading shaders from file
* Switch to signal semaphores for flexibility
Prepare broadcasting support for mul mat
* Finish broadcasting mul mat support for GQA
* Clean up unused functions
Add repeat op
* Add further ops, not yet enabled. Improve semaphore code
* Reduce number of used semaphores by utilizing timelines more properly
* Remove queue information
* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations
* Add Vulkan to llama-bench
* Remove cblas dependency
* Fix matmul k-split bug
* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader
* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug
* Fix issues with float16 overflows in shaders
* Fix issues with older Vulkan headers on Ubuntu 22.04
* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers
* Implement further ops, rework op_f32 calls, fix bugs
* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code
* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders
* Merge upstream changes, fix conflicts, adapt soft_max op
* Fix Python and shader header format
* Free model gpu buffers on exit
* Use single queue per device to simplify code
* Add matmul shader support for running multiple calculations in parallel
* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible
* Fix missing event cast
* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity
* Fix warning about empty C function parameters
* Fix compiler warnings
* Properly implement Vulkan backend buffer handling
* Fix oversized host staging buffers
* Simplify barrier synchronization calls
* Fix gcc warnings
* Implement max_size for backend buffer types to limit the size of a single allocation
* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size
* refactor multi buf
* Disable unsupported ops to fix tests
* Check for maintenance4 support before using it
* Handle devices with only a single queue
* Fix single queue logic
* propagate buffer usage in multi buffers
* Implement rope_neox op
* Cleanup header and other files
* Simplify gpu_extras by removing events and putting staging memcpys into contexts
* Move queue into context
Add not-yet-enabled async backend ops
* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization
* Add get_max_size to SYCL backend.
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* llama : fix trailing whitespace
---------
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 18:03:59 +01:00
/* .get_max_size = */ NULL , // defaults to SIZE_MAX
2023-12-21 21:07:46 +01:00
/* .get_alloc_size = */ NULL , // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host ,
} ,
/* .context = */ NULL ,
} ;
return & ggml_backend_cpu_buffer_type_hbm ;
2023-10-08 19:19:14 +02:00
}
2023-12-21 21:07:46 +01:00
# endif
2023-10-08 19:19:14 +02:00
2023-12-07 21:26:54 +01:00
struct ggml_backend_cpu_context {
int n_threads ;
void * work_data ;
size_t work_size ;
2024-02-09 10:42:27 +01:00
ggml_abort_callback abort_callback ;
void * abort_callback_data ;
2023-12-07 21:26:54 +01:00
} ;
2023-10-08 19:19:14 +02:00
2024-01-16 12:16:33 +01:00
// get_name callback of the CPU backend: always the same constant string.
// NOTE(review): the literal is reproduced verbatim, padding spaces included - confirm against upstream.
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    return " CPU ";
}
2024-01-16 12:16:33 +01:00
// free callback of the CPU backend: releases the scratch buffer, the context
// and finally the backend object itself.
GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend->context;

    free(ctx->work_data); // may be NULL if no graph was ever computed; free(NULL) is a no-op
    free(ctx);
    free(backend);
}
2024-01-16 12:16:33 +01:00
// get_default_buffer_type callback: the CPU backend always uses the CPU buffer type.
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    return ggml_backend_cpu_buffer_type();
}
// Graph plan object handed out by the CPU backend.
struct ggml_backend_plan_cpu {
    // result of ggml_graph_plan; its work_data is allocated and freed together with the plan
    struct ggml_cplan cplan;

    // by-value (shallow) copy of the graph the plan was created for
    struct ggml_cgraph cgraph;
};
2024-01-16 12:16:33 +01:00
// Creates a reusable compute plan for `cgraph`.
//
// The plan holds the ggml_cplan computed for the backend's current thread count,
// a shallow copy of the graph, its own work buffer (when one is required) and the
// backend's abort callback. Returns NULL if any allocation fails; nothing is
// leaked in that case.
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(*cpu_plan));
    if (cpu_plan == NULL) {
        // previously dereferenced unconditionally; fail the same way as the work_data path below
        return NULL;
    }

    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
        if (cpu_plan->cplan.work_data == NULL) {
            free(cpu_plan);
            return NULL;
        }
    }

    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;

    return cpu_plan;
}
2024-01-16 12:16:33 +01:00
// graph_plan_free callback: releases the plan's work buffer and the plan itself.
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_UNUSED(backend);

    struct ggml_backend_plan_cpu * p = (struct ggml_backend_plan_cpu *)plan;

    free(p->cplan.work_data);
    free(p);
}
2024-03-04 10:05:42 +01:00
// graph_plan_compute callback: runs the graph copy stored in the plan with the plan's cplan
// and returns the status reported by ggml_graph_compute.
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_UNUSED(backend);

    struct ggml_backend_plan_cpu * p = (struct ggml_backend_plan_cpu *)plan;

    return ggml_graph_compute(&p->cgraph, &p->cplan);
}
2024-03-04 10:05:42 +01:00
// graph_compute callback: plans and runs `cgraph` in one step.
//
// The work buffer lives in the backend context and is only reallocated when the
// new plan needs more space than is currently available. Returns
// GGML_STATUS_ALLOC_FAILED if that reallocation fails, otherwise the status
// reported by ggml_graph_compute.
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan plan = ggml_graph_plan(cgraph, ctx->n_threads);

    if (plan.work_size > ctx->work_size) {
        // old contents are not needed, so free + malloc instead of realloc
        free(ctx->work_data);
        ctx->work_data = malloc(plan.work_size);
        if (ctx->work_data == NULL) {
            ctx->work_size = 0; // keep the context consistent with the now-missing buffer
            return GGML_STATUS_ALLOC_FAILED;
        }
        ctx->work_size = plan.work_size;
    }

    plan.abort_callback      = ctx->abort_callback;
    plan.abort_callback_data = ctx->abort_callback_data;
    plan.work_data           = ctx->work_data;

    return ggml_graph_compute(cgraph, &plan);
}
2024-01-16 12:16:33 +01:00
// supports_op callback: reports whether the CPU backend can execute `op`.
// Everything is accepted except a few GGML_OP_CPY destination types and
// GGML_OP_MUL_MAT calls whose second operand has an unsuitable type.
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    GGML_UNUSED(backend);

    if (op->op == GGML_OP_CPY) {
        // missing type_traits.from_float
        return !(op->type == GGML_TYPE_IQ2_XXS ||
                 op->type == GGML_TYPE_IQ2_XS  ||
                 op->type == GGML_TYPE_IQ1_S   ||
                 op->type == GGML_TYPE_IQ1_M);
    }

    if (op->op == GGML_OP_MUL_MAT) {
        const struct ggml_tensor * src0 = op->src[0];
        const struct ggml_tensor * src1 = op->src[1];

        if (src1->type == GGML_TYPE_F32) {
            return true;
        }
        // otherwise src1 must already be in the vec_dot type of src0
        return src1->type == ggml_internal_get_type_traits(src0->type).vec_dot_type;
    }

    return true;
}
2024-06-13 03:11:35 +02:00
// supports_buft callback: the CPU backend accepts exactly the buffer types that report themselves as host.
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(backend);

    return ggml_backend_buft_is_host(buft);
}
2023-10-08 19:19:14 +02:00
static struct ggml_backend_i cpu_backend_i = {
2023-12-07 21:26:54 +01:00
/* .get_name = */ ggml_backend_cpu_name ,
/* .free = */ ggml_backend_cpu_free ,
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type ,
/* .set_tensor_async = */ NULL ,
/* .get_tensor_async = */ NULL ,
2024-01-12 20:07:38 +01:00
/* .cpy_tensor_async = */ NULL ,
2023-12-07 21:26:54 +01:00
/* .synchronize = */ NULL ,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create ,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free ,
2024-06-13 03:11:35 +02:00
/* .graph_plan_update = */ NULL ,
2023-12-07 21:26:54 +01:00
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute ,
/* .graph_compute = */ ggml_backend_cpu_graph_compute ,
/* .supports_op = */ ggml_backend_cpu_supports_op ,
2024-06-13 03:11:35 +02:00
/* .supports_buft = */ ggml_backend_cpu_supports_buft ,
2024-03-18 11:03:04 +01:00
/* .offload_op = */ NULL ,
2024-03-13 18:54:21 +01:00
/* .event_new = */ NULL ,
/* .event_free = */ NULL ,
/* .event_record = */ NULL ,
/* .event_wait = */ NULL ,
/* .event_synchronize = */ NULL ,
2023-10-08 19:19:14 +02:00
} ;
2024-02-24 17:27:36 +01:00
// Returns a pointer to the fixed 16-byte GUID that identifies the CPU backend.
static ggml_guid_t ggml_backend_cpu_guid(void) {
    static ggml_guid cpu_guid = {
        0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a,
        0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89,
    };

    return &cpu_guid;
}
2023-10-08 19:19:14 +02:00
ggml_backend_t ggml_backend_cpu_init ( void ) {
struct ggml_backend_cpu_context * ctx = malloc ( sizeof ( struct ggml_backend_cpu_context ) ) ;
2024-02-12 08:16:06 +01:00
if ( ctx = = NULL ) {
return NULL ;
}
2023-10-08 19:19:14 +02:00
2024-02-09 10:42:27 +01:00
ctx - > n_threads = GGML_DEFAULT_N_THREADS ;
ctx - > work_data = NULL ;
ctx - > work_size = 0 ;
ctx - > abort_callback = NULL ;
ctx - > abort_callback_data = NULL ;
2023-10-08 19:19:14 +02:00
ggml_backend_t cpu_backend = malloc ( sizeof ( struct ggml_backend ) ) ;
2024-02-12 08:16:06 +01:00
if ( cpu_backend = = NULL ) {
free ( ctx ) ;
return NULL ;
}
2023-10-08 19:19:14 +02:00
* cpu_backend = ( struct ggml_backend ) {
2024-02-24 17:27:36 +01:00
/* .guid = */ ggml_backend_cpu_guid ( ) ,
2023-10-08 19:19:14 +02:00
/* .interface = */ cpu_backend_i ,
/* .context = */ ctx
} ;
return cpu_backend ;
}
2024-01-16 12:16:33 +01:00
// Returns true when `backend` is non-NULL and carries the CPU backend GUID.
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
    if (backend == NULL) {
        return false;
    }

    return ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
}
// Sets the thread count used for subsequently created compute plans.
// Asserts that `backend_cpu` really is a CPU backend.
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    ((struct ggml_backend_cpu_context *)backend_cpu->context)->n_threads = n_threads;
}
2024-02-09 10:42:27 +01:00
// Stores the abort callback and its user data in the backend context; both are
// copied into every compute plan built afterwards.
// Asserts that `backend_cpu` really is a CPU backend.
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;

    cpu_ctx->abort_callback_data = abort_callback_data;
    cpu_ctx->abort_callback      = abort_callback;
}
2024-01-16 12:16:33 +01:00
// Wraps caller-provided memory in a CPU buffer using the from-pointer interface table.
// `ptr` must be a multiple of TENSOR_ALIGNMENT; this is asserted.
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && " buffer pointer must be aligned ");

    ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();

    return ggml_backend_buffer_init(cpu_buft, cpu_backend_buffer_i_from_ptr, ptr, size);
}
2024-01-16 12:16:33 +01:00
// Init callback with the (params, user_data) signature; both arguments are ignored
// and a default CPU backend is created.
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
    GGML_UNUSED(params);
    GGML_UNUSED(user_data);

    return ggml_backend_cpu_init();
}
2023-11-13 13:16:23 +01:00
ggml : add Vulkan backend (#2059)
* Vulkan loader code
* Fix matmul kernel, continue implementation
* Continue implementation
* Vulkan memory management
* Vulkan development
* Matmul call
* Add aligned malloc and free for VMA
* Continue implementation
* First matmul success
* GEMM Kernel optimization
* 1D Blocktiling
* 2D Blocktiling
* Write coalescing
* Continue vulkan implementation and optimization
* First FP16 attempt, disabled for now
* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel
* Enable device extensions properly, restore fp16 matmul op
* Fix mulmat_f16
* Output FP32 in fp16 matmul shader
* Fix f16_to_f32 kernel
* dequant_q4_0 kernel
* Add VMA library
* Avoid requesting dedicated memory, VMA can decide that by itself
* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly
* add cmake commands
* Add 2d write operation, profiling code
* Fix 2d write
* Fix queue selection for AMD RADV
* Fix trailing whitespace in vk_mem_alloc.h
* Add WIP warp tile mat mul shaders
* Disable glslc optimization
* Disable glslc optimization for CMake
* Optimize warptile matmul shader, replace blocktile with it
* Add split-k optimization for small matrix multiplication
Use semaphores for synchronization instead of fences or waitidle
Rework async write/read for synchronization
* Fix validation errors, improve compatibility with AMD GPUs
* Rework command buffer handling
* Variable matmul kernel using specialization constants
* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints
* Reuse semaphores
* Handle stage flags during command buffer submission properly
* Increase matmul test runs for consistent results
* Fix F32 matmul
* Add vectorized loading and zeropadding for matrix multiplication
* Use pinned memory for f16 preprocessing
* Don't force aligned matmul
* Don't free before queue done
* Replace VMA library with native Vulkan buffer management
* Basic offloading support with mul_f32 and dmmv for q4_0
* Run glslc commands in parallel
* Unroll loops in dmmv shader
* Reduce usage of waitIdle
* Reuse pinned allocation for f16 conversion
* Handle devices with only a single queue
* Fix trailing whitespace in CMakeLists.txt
* Allow parallel execution of kernels, parallelize third and fourth dimension calls
* Add fallback for devices only supporting one DescriptorSet per DescriptorPool
* Move to graph function similar to CUDA implementation
* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function
* Add F32 dmmv shaders
* Batch submissions
* Add .spv to gitignore
* Split off matrix vector multiplication for separate optimization
* Use single command buffer for matrix vector multiplication ops
* Reduce overhead of mul_f32 calls by using a single command buffer
* Add submission batching to mul_f32
* Fix tests
* Add missing barrier
* Add further missing barrier
* Add further ops
* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions
* Remove unnecessary cblas link
* Fix descriptor set pre-allocation assert
* Add runtime shader compilation, start transferring shaders to this approach
* Transfer remaining shaders to header and compile on runtime
* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16
* Add support for q4_1, q5_0, q5_1 and q8_0
* Remove unnecessary scalar layout extension
* Parse graph early to pre-record command buffers
* Add q6_k support
* Add multi-submit for command buffers
* Fix q6_k dequant shader for AMD
* Fix q6_k for GPUs without fp16 support
* Simplify q6_k fp16 fix
* Minor fixes
* Fix wg_denom of m-mulmat shaders
* Add Python-based Vulkan shader generator
* Replace shaderc dependency with precompiled shaders
Fix python script to generate shaders
* Clean up code
* Fix shader generator script Windows compatibility
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
* Close file before deletion
* Fix vulkan shader fp32 name
* Add q2_k and q3_k support
Add validation check to compare shader results to cpu results
* Add q4_k support
* Add q5_k support
* Bake SPIR-V bytecode into the library instead of loading shaders from file
* Switch to signal semaphores for flexibility
Prepare broadcasting support for mul mat
* Finish broadcasting mul mat support for GQA
* Clean up unused functions
Add repeat op
* Add further ops, not yet enabled. Improve semaphore code
* Reduce number of used semaphores by utilizing timelines more properly
* Remove queue information
* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations
* Add Vulkan to llama-bench
* Remove cblas dependency
* Fix matmul k-split bug
* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader
* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug
* Fix issues with float16 overflows in shaders
* Fix issues with older Vulkan headers on Ubuntu 22.04
* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers
* Implement further ops, rework op_f32 calls, fix bugs
* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code
* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders
* Merge upstream changes, fix conflicts, adapt soft_max op
* Fix Python and shader header format
* Free model gpu buffers on exit
* Use single queue per device to simplify code
* Add matmul shader support for running multiple calculations in parallel
* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible
* Fix missing event cast
* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity
* Fix warning about empty C function parameters
* Fix compiler warnings
* Properly implement Vulkan backend buffer handling
* Fix oversized host staging buffers
* Simplify barrier synchronization calls
* Fix gcc warnings
* Implement max_size for backend buffer types to limit the size of a single allocation
* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size
* refactor multi buf
* Disable unsupported ops to fix tests
* Check for maintenance4 support before using it
* Handle devices with only a single queue
* Fix single queue logic
* propagate buffer usage in multi buffers
* Implement rope_neox op
* Cleanup header and other files
* Simplify gpu_extras by removing events and putting staging memcpys into contexts
* Move queue into context
Add not-yet-enabled async backend ops
* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization
* Add get_max_size to SYCL backend.
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* llama : fix trailing whitespace
---------
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-28 18:03:59 +01:00
// multi-buffer buffer
// context of a multi-buffer: a group of backend buffers that is handled through a single buffer object
struct ggml_backend_multi_buffer_context {
// array holding the n_buffers wrapped buffers (allocated with malloc in ggml_backend_multi_buffer_alloc_buffer)
ggml_backend_buffer_t * buffers ;
// number of entries in buffers
size_t n_buffers ;
} ;
// pointer alias used when casting buffer->context back to the multi-buffer context
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t ;
// the name of a multi-buffer is whatever its first wrapped buffer reports as its name
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
    struct ggml_backend_multi_buffer_context * multi = (struct ggml_backend_multi_buffer_context *) buffer->context;
    ggml_backend_buffer_t first = multi->buffers[0];

    return first->iface.get_name(first);
}
// frees every wrapped buffer (in array order), then the array and the context itself
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    struct ggml_backend_multi_buffer_context * multi = (struct ggml_backend_multi_buffer_context *) buffer->context;

    size_t idx = 0;
    while (idx < multi->n_buffers) {
        ggml_backend_buffer_free(multi->buffers[idx]);
        idx++;
    }

    free(multi->buffers);
    free(multi);
}
// forwards the clear request, with the same fill value, to each wrapped buffer in array order
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    struct ggml_backend_multi_buffer_context * multi = (struct ggml_backend_multi_buffer_context *) buffer->context;

    size_t idx = 0;
    while (idx < multi->n_buffers) {
        ggml_backend_buffer_clear(multi->buffers[idx], value);
        idx++;
    }
}
// returns the interface table used by multi-buffers
// only get_name, free_buffer and clear are provided; every other entry is NULL
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface ( void ) {
// positional initializer: the comments name the field each entry is meant to fill
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
/* .get_name = */ ggml_backend_multi_buffer_get_name ,
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer ,
/* .get_base = */ NULL ,
/* .init_tensor = */ NULL ,
/* .set_tensor = */ NULL ,
/* .get_tensor = */ NULL ,
/* .cpy_tensor = */ NULL ,
/* .clear = */ ggml_backend_multi_buffer_clear ,
/* .reset = */ NULL ,
} ;
// the table is returned by value
return multi_backend_buffer_i ;
}
// Creates a multi-buffer that wraps the n_buffers buffers listed in `buffers`.
//
// - the pointer array is copied, so the caller's `buffers` array is not referenced after the call
// - the wrapped buffers are freed when the multi-buffer is freed (see ggml_backend_multi_buffer_free_buffer)
// - the size of the new buffer is the sum of the sizes of the wrapped buffers
// - the buffer type of the new buffer is taken from buffers[0], so at least one buffer is required
//
// Allocation failures and an empty input abort through GGML_ASSERT.
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
    // buffers[0] is dereferenced below, an empty list would be an out-of-bounds read
    GGML_ASSERT(buffers != NULL);
    GGML_ASSERT(n_buffers > 0);

    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
    // check the context allocation before writing to it
    GGML_ASSERT(ctx != NULL);

    ctx->n_buffers = n_buffers;
    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(ctx->buffers != NULL);

    size_t total_size = 0;
    for (size_t i = 0; i < n_buffers; i++) {
        ctx->buffers[i] = buffers[i];
        total_size += ggml_backend_buffer_get_size(buffers[i]);
    }

    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
}
// a buffer is a multi-buffer when its get_name callback is the multi-buffer implementation
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
    if (buffer->iface.get_name != ggml_backend_multi_buffer_get_name) {
        return false;
    }
    return true;
}
// applies the usage to every wrapped buffer; asserts that `buffer` really is a multi-buffer
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));

    struct ggml_backend_multi_buffer_context * multi = (struct ggml_backend_multi_buffer_context *) buffer->context;

    size_t idx = 0;
    while (idx < multi->n_buffers) {
        ggml_backend_buffer_set_usage(multi->buffers[idx], usage);
        idx++;
    }
}
2024-02-12 08:16:06 +01:00
// creates a copy of the tensor with the same memory layout
static struct ggml_tensor * ggml_dup_tensor_layout ( struct ggml_context * ctx , const struct ggml_tensor * tensor ) {
struct ggml_tensor * dup = ggml_dup_tensor ( ctx , tensor ) ;
for ( int i = 0 ; i < GGML_MAX_DIMS ; i + + ) {
dup - > nb [ i ] = tensor - > nb [ i ] ;
}
return dup ;
}
static bool ggml_is_view_op ( enum ggml_op op ) {
return op = = GGML_OP_VIEW | | op = = GGML_OP_RESHAPE | | op = = GGML_OP_PERMUTE | | op = = GGML_OP_TRANSPOSE ;
}
2023-12-07 21:26:54 +01:00
2023-11-13 13:16:23 +01:00
// scheduler
2024-03-13 18:54:21 +01:00
// compile-time limits of the scheduler; each one can be overridden by defining it before this point

// size of the per-backend arrays in struct ggml_backend_sched (backends, bufts, events, tensor_copies)
# ifndef GGML_SCHED_MAX_BACKENDS
# define GGML_SCHED_MAX_BACKENDS 16
# endif
// number of splits assumed when sizing context_buffer in struct ggml_backend_sched
# ifndef GGML_SCHED_MAX_SPLITS
2024-03-18 11:03:04 +01:00
# define GGML_SCHED_MAX_SPLITS 2048
2024-03-13 18:54:21 +01:00
# endif
// size of the inputs array of a split and of graph_inputs in the scheduler
# ifndef GGML_SCHED_MAX_SPLIT_INPUTS
2024-03-18 16:33:44 +01:00
# define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
2024-03-13 18:54:21 +01:00
# endif
// second dimension of the events and tensor_copies arrays
# ifndef GGML_SCHED_MAX_COPIES
# define GGML_SCHED_MAX_COPIES 4
# endif
2023-11-13 13:16:23 +01:00
// one split of the graph: a range of nodes that is handled by a single backend
struct ggml_backend_sched_split {
2024-02-12 08:16:06 +01:00
// index into sched->backends of the backend that owns this split
int backend_id ;
2023-11-13 13:16:23 +01:00
// index of the first graph node of the split
int i_start ;
// NOTE(review): presumably the index one past the last node of the split - confirm against the split loop
int i_end ;
2024-03-13 18:54:21 +01:00
// input tensors of the split, only the first n_inputs entries are used
struct ggml_tensor * inputs [ GGML_SCHED_MAX_SPLIT_INPUTS ] ;
2023-11-13 13:16:23 +01:00
int n_inputs ;
2024-01-12 20:07:38 +01:00
// graph view of this split
2023-12-07 21:26:54 +01:00
struct ggml_cgraph graph ;
2023-11-13 13:16:23 +01:00
} ;
// state of the backend scheduler
struct ggml_backend_sched {
2024-01-12 20:07:38 +01:00
bool is_reset ; // true if the scheduler has been reset since the last graph split
2024-03-13 18:54:21 +01:00
// NOTE(review): presumably set once the current graph has been allocated - confirm against the alloc/reset code
bool is_alloc ;
2024-01-12 20:07:38 +01:00
2023-11-13 13:16:23 +01:00
// number of valid entries in backends and bufts
int n_backends ;
2024-03-13 18:54:21 +01:00
// backends ordered by priority: a lower index is a higher priority, the last one is assumed to be the CPU backend
ggml_backend_t backends [ GGML_SCHED_MAX_BACKENDS ] ;
// buffer type associated with each backend, indexed by backend id
ggml_backend_buffer_type_t bufts [ GGML_SCHED_MAX_BACKENDS ] ;
2023-11-13 13:16:23 +01:00
ggml_gallocr_t galloc ;
2024-01-12 20:07:38 +01:00
// hash keys of the nodes in the graph
2023-11-13 13:16:23 +01:00
struct ggml_hash_set hash_set ;
2024-02-12 08:16:06 +01:00
// hash values
// backend id assigned to each tensor, indexed by the tensor's slot in hash_set; -1 means not assigned
int * tensor_backend_id ;
2024-03-13 18:54:21 +01:00
// NOTE(review): presumably the per-backend, per-copy duplicates of each tensor, indexed like tensor_backend_id - confirm
struct ggml_tensor * ( * tensor_copies ) [ GGML_SCHED_MAX_BACKENDS ] [ GGML_SCHED_MAX_COPIES ] ;
2024-02-12 08:16:06 +01:00
2024-03-13 18:54:21 +01:00
int * node_backend_ids ; // [graph_size]
int * leaf_backend_ids ; // [graph_size]
2023-11-13 13:16:23 +01:00
2024-06-13 03:11:35 +02:00
int * prev_node_backend_ids ; // [graph_size]
int * prev_leaf_backend_ids ; // [graph_size]
2024-01-12 20:07:38 +01:00
// copy of the graph with modified inputs
2023-11-13 13:16:23 +01:00
struct ggml_cgraph * graph ;
2024-01-12 20:07:38 +01:00
2024-03-13 18:54:21 +01:00
// graph splits
2024-03-18 11:03:04 +01:00
struct ggml_backend_sched_split * splits ;
2023-11-13 13:16:23 +01:00
// number of splits currently in use, reset to 0 at the start of every graph split
int n_splits ;
2024-03-18 11:03:04 +01:00
// NOTE(review): presumably the number of entries allocated in splits - confirm
int splits_capacity ;
2023-11-13 13:16:23 +01:00
2024-03-13 18:54:21 +01:00
// pipeline parallelism support
int n_copies ;
int cur_copy ;
ggml_backend_event_t events [ GGML_SCHED_MAX_BACKENDS ] [ GGML_SCHED_MAX_COPIES ] ;
struct ggml_tensor * graph_inputs [ GGML_SCHED_MAX_SPLIT_INPUTS ] ;
// number of used entries in graph_inputs, reset to 0 at the start of every graph split
int n_graph_inputs ;
2023-11-13 13:16:23 +01:00
// ggml context created on top of context_buffer, re-created for every graph split
struct ggml_context * ctx ;
2024-02-12 08:16:06 +01:00
ggml_backend_sched_eval_callback callback_eval ;
void * callback_eval_user_data ;
2024-06-13 03:11:35 +02:00
bool debug ;
2023-11-13 13:16:23 +01:00
// align context_buffer to GGML_MEM_ALIGN
2024-03-13 18:54:21 +01:00
# ifdef _MSC_VER
2023-11-13 13:16:23 +01:00
__declspec ( align ( GGML_MEM_ALIGN ) )
2024-03-13 18:54:21 +01:00
# else
2023-11-13 13:16:23 +01:00
__attribute__ ( ( aligned ( GGML_MEM_ALIGN ) ) )
2024-03-13 18:54:21 +01:00
# endif
// memory handed to ggml_init as the buffer of ctx
char context_buffer [ GGML_SCHED_MAX_SPLITS * GGML_SCHED_MAX_SPLIT_INPUTS * 2 * sizeof ( struct ggml_tensor ) + sizeof ( struct ggml_cgraph ) ] ;
2023-11-13 13:16:23 +01:00
} ;
2024-03-13 18:54:21 +01:00
// both macros expect a variable named `sched` to be in scope at the point of use
// slot of the tensor in sched->hash_set (presumably inserting the tensor when it is not there yet, per ggml_hash_find_or_insert)
# define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
// lvalue with the backend id assigned to the tensor; the scheduler passes treat -1 as "not assigned"
# define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
2023-11-13 13:16:23 +01:00
2024-02-12 08:16:06 +01:00
// returns the priority of the backend, i.e. its index in sched->backends (lower id is higher priority)
// returns -1 when the backend is not registered in the scheduler
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int idx = 0;
    while (idx < sched->n_backends && sched->backends[idx] != backend) {
        idx++;
    }
    return idx < sched->n_backends ? idx : -1;
}
2024-06-13 03:11:35 +02:00
// Returns the id of the highest priority backend that supports both the buffer type of the
// buffer of `tensor` and the operation `op`.
//
//   tensor - tensor whose buffer is inspected (the op itself, its view source, or one of its sources)
//   op     - operation that has to be supported by the selected backend
//
// Returns -1 when the tensor has no buffer or when no backend satisfies both conditions.
// In the latter case a warning is printed to stderr unless NDEBUG is defined.
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
    ggml_backend_buffer_t buffer = tensor->buffer;
    if (buffer == NULL) {
        return -1;
    }

    // find highest prio backend that supports the buffer type and the op
    for (int i = 0; i < sched->n_backends; i++) {
        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
            ggml_backend_supports_op(sched->backends[i], op)) {
            return i;
        }
    }

#ifndef NDEBUG
    // the op that was tested is `op`; `tensor` only provides the buffer (it can be a weight or a view source),
    // so the op description and the name of the tensor using the buffer are taken from `op`
    fprintf(stderr, " %s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied \n ",
        __func__, ggml_op_desc(op), ggml_backend_buffer_name(buffer), op->name);
#endif

    return -1;
}
// debug helpers that record, per tensor, a short text describing why a backend was chosen
// the recording variant is disabled by the #if 0: as compiled, SET_CAUSE expands to nothing and GET_CAUSE to ""
#if 0
2024-03-13 18:54:21 +01:00
static char causes [ GGML_DEFAULT_GRAPH_SIZE * 16 + GGML_SCHED_MAX_SPLITS * GGML_SCHED_MAX_SPLIT_INPUTS ] [ 128 ] ; // debug only
2023-12-07 21:26:54 +01:00
// both macros index causes by hash_id(node), so they need `sched` in scope
# define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
# define GET_CAUSE(node) causes[hash_id(node)]
# else
# define SET_CAUSE(node, ...)
# define GET_CAUSE(node) ""
# endif
2023-11-13 13:16:23 +01:00
// returns the backend that should be used for the node based on the current locations
2024-02-12 08:16:06 +01:00
static int ggml_backend_sched_backend_id_from_cur ( ggml_backend_sched_t sched , struct ggml_tensor * tensor ) {
// TODO: use supports_op to check if the backend supports the op
2024-01-12 20:07:38 +01:00
// assign pre-allocated nodes to their backend
2024-06-13 03:11:35 +02:00
int cur_backend_id = ggml_backend_sched_backend_from_buffer ( sched , tensor , tensor ) ;
2024-03-18 11:03:04 +01:00
if ( cur_backend_id ! = - 1 ) {
2024-03-13 18:54:21 +01:00
SET_CAUSE ( tensor , " 1.dst " ) ;
2024-03-18 11:03:04 +01:00
return cur_backend_id ;
2023-11-13 13:16:23 +01:00
}
2024-03-13 18:54:21 +01:00
2023-11-13 13:16:23 +01:00
// view_src
2024-02-12 08:16:06 +01:00
if ( tensor - > view_src ! = NULL ) {
2024-06-13 03:11:35 +02:00
cur_backend_id = ggml_backend_sched_backend_from_buffer ( sched , tensor - > view_src , tensor ) ;
2024-03-18 11:03:04 +01:00
if ( cur_backend_id ! = - 1 ) {
2024-03-13 18:54:21 +01:00
SET_CAUSE ( tensor , " 1.vsrc " ) ;
2024-03-18 11:03:04 +01:00
return cur_backend_id ;
2024-01-12 20:07:38 +01:00
}
2023-11-13 13:16:23 +01:00
}
2024-03-13 18:54:21 +01:00
2024-03-18 11:03:04 +01:00
// graph input
2024-03-13 18:54:21 +01:00
if ( tensor - > flags & GGML_TENSOR_FLAG_INPUT ) {
2024-03-18 11:03:04 +01:00
cur_backend_id = sched - > n_backends - 1 ; // last backend (assumed CPU)
2024-03-13 18:54:21 +01:00
SET_CAUSE ( tensor , " 1.inp " ) ;
2024-03-18 11:03:04 +01:00
return cur_backend_id ;
2024-03-13 18:54:21 +01:00
}
2024-01-12 20:07:38 +01:00
// assign nodes that use weights to the backend of the weights
2024-03-18 11:03:04 +01:00
// operations with weights are preferably run on the same backend as the weights
2023-11-13 13:16:23 +01:00
for ( int i = 0 ; i < GGML_MAX_SRC ; i + + ) {
2024-02-12 08:16:06 +01:00
const struct ggml_tensor * src = tensor - > src [ i ] ;
2023-11-13 13:16:23 +01:00
if ( src = = NULL ) {
2024-02-17 22:04:16 +01:00
continue ;
2023-11-13 13:16:23 +01:00
}
2024-01-12 20:07:38 +01:00
if ( src - > buffer ! = NULL & & src - > buffer - > usage = = GGML_BACKEND_BUFFER_USAGE_WEIGHTS ) {
2024-06-13 03:11:35 +02:00
int src_backend_id = ggml_backend_sched_backend_from_buffer ( sched , src , tensor ) ;
2024-03-18 11:03:04 +01:00
// check if a backend with higher prio wants to offload the op
if ( src_backend_id = = sched - > n_backends - 1 ) {
for ( int b = 0 ; b < src_backend_id ; b + + ) {
2024-06-17 16:51:42 +02:00
if ( ggml_backend_supports_op ( sched - > backends [ b ] , tensor ) & & ggml_backend_offload_op ( sched - > backends [ b ] , tensor ) ) {
2024-03-18 11:03:04 +01:00
SET_CAUSE ( tensor , " 1.off " ) ;
return b ;
}
}
}
2024-03-13 18:54:21 +01:00
SET_CAUSE ( tensor , " 1.wgt%d " , i ) ;
2024-03-18 11:03:04 +01:00
return src_backend_id ;
2023-11-13 13:16:23 +01:00
}
}
2024-01-12 20:07:38 +01:00
2024-02-12 08:16:06 +01:00
return - 1 ;
2023-11-13 13:16:23 +01:00
}
// formats a size in bytes for the debug output: whole megabytes (1024*1024 bytes) when the size is
// at least one megabyte, whole kilobytes otherwise (the division truncates)
// the text is written to a static buffer, so the result is overwritten by the next call
static char * fmt_size(size_t size) {
    static char buffer[128];

    if (size < 1024 * 1024) {
        snprintf(buffer, sizeof(buffer), " %zuK ", size / 1024);
        return buffer;
    }

    snprintf(buffer, sizeof(buffer), " %zuM ", size / 1024 / 1024);
    return buffer;
}
2024-02-12 08:16:06 +01:00
// prints to stderr the backend assigned to each node of the graph and to each of its sources
// a header with the backend and the inputs of a split is printed before the first node of every split
// view ops are not printed
static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    int next_split = 0;

    for (int n = 0; n < graph->n_nodes; n++) {
        // split header, printed when reaching the first node of the next split
        if (next_split < sched->n_splits && n == sched->splits[next_split].i_start) {
            const struct ggml_backend_sched_split * split = &sched->splits[next_split];
            ggml_backend_t split_backend = sched->backends[split->backend_id];

            fprintf(stderr, " \n ## SPLIT #%d: %s # %d inputs: ", next_split, ggml_backend_name(split_backend),
                split->n_inputs);
            for (int k = 0; k < split->n_inputs; k++) {
                struct ggml_tensor * input = split->inputs[k];
                fprintf(stderr, " [%s (%5.5s)] ", input->name,
                    fmt_size(ggml_nbytes(input)));
            }
            fprintf(stderr, " \n ");
            next_split++;
        }

        struct ggml_tensor * node = graph->nodes[n];
        if (ggml_is_view_op(node->op)) {
            continue;
        }

        // the node itself
        ggml_backend_t node_backend = ggml_backend_sched_get_tensor_backend(sched, node);
        fprintf(stderr, " node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]: ", n, ggml_op_name(node->op), node->name,
            fmt_size(ggml_nbytes(node)), node_backend ? ggml_backend_name(node_backend) : " NULL ", GET_CAUSE(node));

        // followed by its sources on the same line
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            struct ggml_tensor * src = node->src[s];
            if (src != NULL) {
                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
                fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s] ", src->name,
                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : " NULL ", GET_CAUSE(src));
            }
        }
        fprintf(stderr, " \n ");
    }
}
2024-06-13 03:11:35 +02:00
// checks whether the backend `backend_id` supports the buffer type in which the tensor is, or will be, stored
// the buffer type is taken from the buffer of the view source when there is a view source, else from the
// buffer of the tensor; without a buffer it falls back to the buffer type of the backend assigned to the
// tensor (or to its view source)
// returns false when no buffer type can be determined
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
    ggml_backend_buffer_t allocated = t->view_src != NULL ? t->view_src->buffer : t->buffer;
    ggml_backend_buffer_type_t buft = NULL;

    if (allocated != NULL) {
        // the tensor is already allocated
        buft = allocated->buft;
    } else {
        // see if the tensor already has a backend assigned, and use the buffer type of that backend
        int assigned_id = tensor_backend_id(t);
        if (assigned_id == -1 && t->view_src != NULL) {
            assigned_id = tensor_backend_id(t->view_src);
        }
        if (assigned_id != -1) {
            buft = sched->bufts[assigned_id];
        }
    }

    if (buft == NULL) {
        return false;
    }
    return ggml_backend_supports_buft(sched->backends[backend_id], buft);
}
// stores cur_backend_id in *node_backend_id when that backend supports the op of the node
// otherwise *node_backend_id is left as it is
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
    ggml_backend_t candidate = sched->backends[cur_backend_id];
    if (!ggml_backend_supports_op(candidate, node)) {
        return;
    }

    *node_backend_id = cur_backend_id;
    SET_CAUSE(node, " 2.sup ");
}
2024-01-12 20:07:38 +01:00
2023-11-13 13:16:23 +01:00
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
2024-02-12 08:16:06 +01:00
static void ggml_backend_sched_split_graph ( ggml_backend_sched_t sched , struct ggml_cgraph * graph ) {
2024-01-12 20:07:38 +01:00
// reset splits
2023-11-13 13:16:23 +01:00
sched - > n_splits = 0 ;
2024-03-13 18:54:21 +01:00
sched - > n_graph_inputs = 0 ;
2024-01-12 20:07:38 +01:00
sched - > is_reset = false ;
2023-11-13 13:16:23 +01:00
struct ggml_init_params params = {
2023-12-07 21:26:54 +01:00
/* .mem_size = */ sizeof ( sched - > context_buffer ) ,
/* .mem_buffer = */ sched - > context_buffer ,
/* .no_alloc = */ true
2023-11-13 13:16:23 +01:00
} ;
2024-01-12 20:07:38 +01:00
ggml_free ( sched - > ctx ) ;
2023-11-13 13:16:23 +01:00
sched - > ctx = ggml_init ( params ) ;
2024-01-12 20:07:38 +01:00
if ( sched - > ctx = = NULL ) {
fprintf ( stderr , " %s: failed to initialize context \n " , __func__ ) ;
GGML_ASSERT ( false ) ;
}
2023-11-13 13:16:23 +01:00
2024-01-12 20:07:38 +01:00
// pass 1: assign backends to ops with pre-allocated inputs
2023-11-13 13:16:23 +01:00
for ( int i = 0 ; i < graph - > n_leafs ; i + + ) {
struct ggml_tensor * leaf = graph - > leafs [ i ] ;
2024-03-18 11:03:04 +01:00
int * leaf_backend_id = & tensor_backend_id ( leaf ) ;
if ( * leaf_backend_id ! = - 1 ) {
2023-11-13 13:16:23 +01:00
// do not overwrite user assignments
continue ;
}
2024-03-18 11:03:04 +01:00
* leaf_backend_id = ggml_backend_sched_backend_id_from_cur ( sched , leaf ) ;
2023-11-13 13:16:23 +01:00
}
for ( int i = 0 ; i < graph - > n_nodes ; i + + ) {
struct ggml_tensor * node = graph - > nodes [ i ] ;
2024-03-18 11:03:04 +01:00
int * node_backend_id = & tensor_backend_id ( node ) ;
if ( * node_backend_id ! = - 1 ) {
2023-11-13 13:16:23 +01:00
// do not overwrite user assignments
continue ;
}
2024-03-18 11:03:04 +01:00
* node_backend_id = ggml_backend_sched_backend_id_from_cur ( sched , node ) ;
2024-01-12 20:07:38 +01:00
// src
for ( int j = 0 ; j < GGML_MAX_SRC ; j + + ) {
struct ggml_tensor * src = node - > src [ j ] ;
if ( src = = NULL ) {
2024-02-17 22:04:16 +01:00
continue ;
2024-01-12 20:07:38 +01:00
}
2024-03-18 11:03:04 +01:00
int * src_backend_id = & tensor_backend_id ( src ) ;
if ( * src_backend_id = = - 1 ) {
* src_backend_id = ggml_backend_sched_backend_id_from_cur ( sched , src ) ;
2024-01-12 20:07:38 +01:00
}
2023-11-13 13:16:23 +01:00
}
}
2024-01-12 20:07:38 +01:00
// pass 2: expand current backend assignments
// assign the same backend to adjacent nodes
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
2024-06-13 03:11:35 +02:00
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
// expand gpu down
2024-01-12 20:07:38 +01:00
{
2024-02-12 08:16:06 +01:00
int cur_backend_id = - 1 ;
2024-03-13 18:54:21 +01:00
for ( int i = 0 ; i < graph - > n_nodes ; i + + ) {
2024-01-12 20:07:38 +01:00
struct ggml_tensor * node = graph - > nodes [ i ] ;
if ( ggml_is_view_op ( node - > op ) ) {
continue ;
}
2024-03-18 11:03:04 +01:00
int * node_backend_id = & tensor_backend_id ( node ) ;
if ( * node_backend_id ! = - 1 ) {
if ( * node_backend_id = = sched - > n_backends - 1 ) {
2024-01-12 20:07:38 +01:00
// skip cpu (lowest prio backend)
2024-02-12 08:16:06 +01:00
cur_backend_id = - 1 ;
2024-01-12 20:07:38 +01:00
} else {
2024-03-18 11:03:04 +01:00
cur_backend_id = * node_backend_id ;
2023-11-13 13:16:23 +01:00
}
2024-06-13 03:11:35 +02:00
} else if ( cur_backend_id ! = - 1 ) {
ggml_backend_sched_set_if_supported ( sched , node , cur_backend_id , node_backend_id ) ;
2024-01-12 20:07:38 +01:00
}
}
}
2024-06-13 03:11:35 +02:00
// expand gpu up
2024-01-12 20:07:38 +01:00
{
2024-02-12 08:16:06 +01:00
int cur_backend_id = - 1 ;
2024-03-13 18:54:21 +01:00
for ( int i = graph - > n_nodes - 1 ; i > = 0 ; i - - ) {
2024-01-12 20:07:38 +01:00
struct ggml_tensor * node = graph - > nodes [ i ] ;
if ( ggml_is_view_op ( node - > op ) ) {
continue ;
}
2024-03-18 11:03:04 +01:00
int * node_backend_id = & tensor_backend_id ( node ) ;
if ( * node_backend_id ! = - 1 ) {
if ( * node_backend_id = = sched - > n_backends - 1 ) {
2024-01-12 20:07:38 +01:00
// skip cpu (lowest prio backend)
2024-02-12 08:16:06 +01:00
cur_backend_id = - 1 ;
2024-01-12 20:07:38 +01:00
} else {
2024-03-18 11:03:04 +01:00
cur_backend_id = * node_backend_id ;
2023-11-13 13:16:23 +01:00
}
2024-06-13 03:11:35 +02:00
} else if ( cur_backend_id ! = - 1 ) {
ggml_backend_sched_set_if_supported ( sched , node , cur_backend_id , node_backend_id ) ;
2023-11-13 13:16:23 +01:00
}
2024-01-12 20:07:38 +01:00
}
}
2024-06-13 03:11:35 +02:00
// expand rest down
2024-01-12 20:07:38 +01:00
{
2024-02-12 08:16:06 +01:00
int cur_backend_id = - 1 ;
2024-03-13 18:54:21 +01:00
for ( int i = 0 ; i < graph - > n_nodes ; i + + ) {
2024-01-12 20:07:38 +01:00
struct ggml_tensor * node = graph - > nodes [ i ] ;
if ( ggml_is_view_op ( node - > op ) ) {
continue ;
}
2024-03-18 11:03:04 +01:00
int * node_backend_id = & tensor_backend_id ( node ) ;
if ( * node_backend_id ! = - 1 ) {
cur_backend_id = * node_backend_id ;
2024-06-13 03:11:35 +02:00
} else if ( cur_backend_id ! = - 1 ) {
ggml_backend_sched_set_if_supported ( sched , node , cur_backend_id , node_backend_id ) ;
2023-11-13 13:16:23 +01:00
}
}
}
2024-06-13 03:11:35 +02:00
// expand rest up
2024-01-12 20:38:34 +01:00
{
2024-02-12 08:16:06 +01:00
int cur_backend_id = - 1 ;
2024-03-13 18:54:21 +01:00
for ( int i = graph - > n_nodes - 1 ; i > = 0 ; i - - ) {
2024-01-12 20:38:34 +01:00
struct ggml_tensor * node = graph - > nodes [ i ] ;
if ( ggml_is_view_op ( node - > op ) ) {
continue ;
}
2024-03-18 11:03:04 +01:00
int * node_backend_id = & tensor_backend_id ( node ) ;
if ( * node_backend_id ! = - 1 ) {
cur_backend_id = * node_backend_id ;
2024-06-13 03:11:35 +02:00
} else if ( cur_backend_id ! = - 1 ) {
ggml_backend_sched_set_if_supported ( sched , node , cur_backend_id , node_backend_id ) ;
2024-01-12 20:38:34 +01:00
}
}
}
2024-03-13 18:54:21 +01:00
2024-06-13 03:11:35 +02:00
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
// however, we also need to verify that the sources are in compatible buffer types
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
for ( int i = 0 ; i < graph - > n_nodes ; i + + ) {
struct ggml_tensor * node = graph - > nodes [ i ] ;
if ( ggml_is_view_op ( node - > op ) ) {
continue ;
}
int * node_backend_id = & tensor_backend_id ( node ) ;
if ( * node_backend_id = = - 1 ) {
// unassigned node: find the backend with the most supported inputs
int n_supported_best = - 1 ;
for ( int b = 0 ; b < sched - > n_backends ; b + + ) {
if ( ggml_backend_supports_op ( sched - > backends [ b ] , node ) ) {
int n_supported = 0 ;
for ( int j = 0 ; j < GGML_MAX_SRC ; j + + ) {
struct ggml_tensor * src = node - > src [ j ] ;
if ( src = = NULL ) {
continue ;
}
if ( ( tensor_backend_id ( src ) ! = - 1 | | tensor_backend_id ( src - > view_src ) ! = - 1 ) & & ggml_backend_sched_buffer_supported ( sched , src , b ) ) {
n_supported + + ;
}
}
if ( n_supported > n_supported_best ) {
n_supported_best = n_supported ;
* node_backend_id = b ;
SET_CAUSE ( node , " 3.best " ) ;
}
}
}
} else {
// assigned node: upgrade to higher prio backend if possible
for ( int b = 0 ; b < * node_backend_id ; b + + ) {
if ( sched - > bufts [ b ] = = sched - > bufts [ * node_backend_id ] & & ggml_backend_supports_op ( sched - > backends [ b ] , node ) ) {
bool supported = true ;
for ( int j = 0 ; j < GGML_MAX_SRC ; j + + ) {
struct ggml_tensor * src = node - > src [ j ] ;
if ( src = = NULL ) {
continue ;
}
if ( ! ggml_backend_sched_buffer_supported ( sched , src , b ) ) {
supported = false ;
break ;
}
}
if ( supported ) {
* node_backend_id = b ;
SET_CAUSE ( node , " 3.upg " ) ;
break ;
}
}
}
}
}
2023-11-13 13:16:23 +01:00
2024-06-13 03:11:35 +02:00
// pass 4: assign backends to remaining src from dst and view_src
2023-11-13 13:16:23 +01:00
for ( int i = 0 ; i < graph - > n_nodes ; i + + ) {
struct ggml_tensor * node = graph - > nodes [ i ] ;
2024-03-18 11:03:04 +01:00
int * cur_backend_id = & tensor_backend_id ( node ) ;
if ( node - > view_src ! = NULL & & * cur_backend_id = = - 1 ) {
* cur_backend_id = tensor_backend_id ( node - > view_src ) ;
2024-06-13 03:11:35 +02:00
SET_CAUSE ( node , " 4.vsrc " ) ;
2024-01-12 20:07:38 +01:00
}
2023-11-13 13:16:23 +01:00
for ( int j = 0 ; j < GGML_MAX_SRC ; j + + ) {
struct ggml_tensor * src = node - > src [ j ] ;
if ( src = = NULL ) {
2024-02-17 22:04:16 +01:00
continue ;
2023-11-13 13:16:23 +01:00
}
2024-03-18 11:03:04 +01:00
int * src_backend_id = & tensor_backend_id ( src ) ;
if ( * src_backend_id = = - 1 ) {
2024-01-12 20:07:38 +01:00
if ( src - > view_src ! = NULL ) {
// views are always on the same backend as the source
2024-03-18 11:03:04 +01:00
* src_backend_id = tensor_backend_id ( src - > view_src ) ;
2024-06-13 03:11:35 +02:00
SET_CAUSE ( src , " 4.vsrc " ) ;
2024-01-12 20:07:38 +01:00
} else {
2024-03-18 11:03:04 +01:00
* src_backend_id = * cur_backend_id ;
2024-06-13 03:11:35 +02:00
SET_CAUSE ( src , " 4.cur " ) ;
2024-01-12 20:07:38 +01:00
}
2023-11-13 13:16:23 +01:00
}
}
}
// pass 5: split graph, find tensors that need to be copied
2024-01-12 20:07:38 +01:00
{
2024-03-18 11:03:04 +01:00
int i_split = 0 ;
struct ggml_backend_sched_split * split = & sched - > splits [ 0 ] ;
2024-01-12 20:07:38 +01:00
// find the backend of the first split, skipping view ops
for ( int i = 0 ; i < graph - > n_nodes ; i + + ) {
struct ggml_tensor * node = graph - > nodes [ i ] ;
if ( ! ggml_is_view_op ( node - > op ) ) {
2024-03-18 11:03:04 +01:00
split - > backend_id = tensor_backend_id ( node ) ;
2024-01-12 20:07:38 +01:00
break ;
}
2023-11-13 13:16:23 +01:00
}
2024-03-18 11:03:04 +01:00
split - > i_start = 0 ;
split - > n_inputs = 0 ;
memset ( split - > inputs , 0 , sizeof ( split - > inputs ) ) ; //HACK
int cur_backend_id = split - > backend_id ;
2024-01-12 20:07:38 +01:00
for ( int i = 0 ; i < graph - > n_nodes ; i + + ) {
struct ggml_tensor * node = graph - > nodes [ i ] ;
if ( ggml_is_view_op ( node - > op ) ) {
continue ;
}
2023-11-13 13:16:23 +01:00
2024-03-18 11:03:04 +01:00
const int node_backend_id = tensor_backend_id ( node ) ;
2024-01-12 20:07:38 +01:00
2024-03-18 11:03:04 +01:00
GGML_ASSERT ( node_backend_id ! = - 1 ) ; // all nodes should be assigned by now
2024-01-12 20:38:34 +01:00
2024-03-18 11:03:04 +01:00
// check if we should start a new split based on the sources of the current node
bool need_new_split = false ;
if ( node_backend_id = = cur_backend_id & & split - > n_inputs > 0 ) {
for ( int j = 0 ; j < GGML_MAX_SRC ; j + + ) {
struct ggml_tensor * src = node - > src [ j ] ;
if ( src = = NULL ) {
continue ;
}
// check if a weight is on a different backend
// by starting a new split, the memory of the previously offloaded weights can be reused
if ( src - > buffer ! = NULL & & src - > buffer - > usage = = GGML_BACKEND_BUFFER_USAGE_WEIGHTS ) {
int src_backend_id = tensor_backend_id ( src ) ;
if ( src_backend_id ! = - 1 & & src_backend_id ! = cur_backend_id ) {
need_new_split = true ;
break ;
}
}
// check if the split has too many inputs
2024-06-13 03:11:35 +02:00
// FIXME: count the number of inputs instead of only checking when full
2024-03-18 11:03:04 +01:00
if ( split - > n_inputs = = GGML_SCHED_MAX_SPLIT_INPUTS ) {
const size_t id = hash_id ( src ) ;
int src_backend_id = sched - > tensor_backend_id [ id ] ;
2024-06-13 03:11:35 +02:00
bool supported = ggml_backend_sched_buffer_supported ( sched , src , cur_backend_id ) ;
if ( src_backend_id ! = cur_backend_id & & sched - > tensor_copies [ hash_id ( src ) ] [ cur_backend_id ] [ 0 ] = = NULL & & ! supported ) {
2024-03-18 11:03:04 +01:00
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true ;
break ;
}
}
}
}
if ( node_backend_id ! = cur_backend_id | | need_new_split ) {
split - > i_end = i ;
i_split + + ;
if ( i_split > = sched - > splits_capacity ) {
sched - > splits_capacity * = 2 ;
sched - > splits = realloc ( sched - > splits , sched - > splits_capacity * sizeof ( struct ggml_backend_sched_split ) ) ;
GGML_ASSERT ( sched - > splits ! = NULL ) ;
}
GGML_ASSERT ( i_split < GGML_SCHED_MAX_SPLITS ) ;
split = & sched - > splits [ i_split ] ;
split - > backend_id = node_backend_id ;
split - > i_start = i ;
split - > n_inputs = 0 ;
cur_backend_id = node_backend_id ;
2024-01-12 20:07:38 +01:00
}
2023-11-13 13:16:23 +01:00
2024-01-12 20:07:38 +01:00
// find inputs that are not on the same backend
for ( int j = 0 ; j < GGML_MAX_SRC ; j + + ) {
struct ggml_tensor * src = node - > src [ j ] ;
if ( src = = NULL ) {
2024-02-17 22:04:16 +01:00
continue ;
2024-01-12 20:07:38 +01:00
}
2024-03-13 18:54:21 +01:00
2024-03-18 11:03:04 +01:00
const int src_backend_id = tensor_backend_id ( src ) ;
2024-02-12 08:16:06 +01:00
assert ( src_backend_id ! = - 1 ) ; // all inputs should be assigned by now
2024-03-13 18:54:21 +01:00
2024-06-13 03:11:35 +02:00
if ( src - > flags & GGML_TENSOR_FLAG_INPUT & & sched - > n_copies > 1 ) {
2024-03-13 18:54:21 +01:00
size_t id = hash_id ( src ) ;
if ( sched - > tensor_copies [ id ] [ src_backend_id ] [ 0 ] = = NULL ) {
ggml_backend_t backend = sched - > backends [ src_backend_id ] ;
for ( int c = 0 ; c < sched - > n_copies ; c + + ) {
struct ggml_tensor * tensor_copy ;
if ( c = = sched - > cur_copy ) {
tensor_copy = src ; // use the original tensor as the current copy
} else {
tensor_copy = ggml_dup_tensor_layout ( sched - > ctx , src ) ;
ggml_format_name ( tensor_copy , " %s#%s#%d " , ggml_backend_name ( backend ) , src - > name , c ) ;
}
if ( sched - > n_copies > 1 ) {
ggml_set_input ( tensor_copy ) ;
ggml_set_output ( tensor_copy ) ; // prevent ggml-alloc from overwriting the tensor
}
sched - > tensor_copies [ id ] [ src_backend_id ] [ c ] = tensor_copy ;
SET_CAUSE ( tensor_copy , " 4.cpy " ) ;
}
int n_graph_inputs = sched - > n_graph_inputs + + ;
GGML_ASSERT ( n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS ) ;
sched - > graph_inputs [ n_graph_inputs ] = src ;
}
}
2024-06-13 03:11:35 +02:00
bool supported = ggml_backend_sched_buffer_supported ( sched , src , cur_backend_id ) ;
if ( src_backend_id ! = cur_backend_id & & ! supported ) {
2024-01-20 16:05:49 +01:00
// create a copy of the input in the split's backend
2024-03-18 11:03:04 +01:00
const size_t id = hash_id ( src ) ;
2024-03-13 18:54:21 +01:00
if ( sched - > tensor_copies [ id ] [ cur_backend_id ] [ 0 ] = = NULL ) {
2024-02-12 08:16:06 +01:00
ggml_backend_t backend = sched - > backends [ cur_backend_id ] ;
2024-03-13 18:54:21 +01:00
for ( int c = 0 ; c < sched - > n_copies ; c + + ) {
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout ( sched - > ctx , src ) ;
ggml_format_name ( tensor_copy , " %s#%s#%d " , ggml_backend_name ( backend ) , src - > name , c ) ;
if ( sched - > n_copies > 1 ) {
ggml_set_input ( tensor_copy ) ;
ggml_set_output ( tensor_copy ) ; // prevent ggml-alloc from overwriting the tensor
}
sched - > tensor_copies [ id ] [ cur_backend_id ] [ c ] = tensor_copy ;
SET_CAUSE ( tensor_copy , " 4.cpy " ) ;
}
2024-03-18 11:03:04 +01:00
int n_inputs = split - > n_inputs + + ;
2024-03-13 18:54:21 +01:00
GGML_ASSERT ( n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS ) ;
2024-03-18 11:03:04 +01:00
split - > inputs [ n_inputs ] = src ;
2024-01-20 16:05:49 +01:00
}
2024-03-13 18:54:21 +01:00
node - > src [ j ] = sched - > tensor_copies [ id ] [ cur_backend_id ] [ sched - > cur_copy ] ;
2023-11-13 13:16:23 +01:00
}
}
}
2024-03-18 11:03:04 +01:00
split - > i_end = graph - > n_nodes ;
sched - > n_splits = i_split + 1 ;
2023-11-13 13:16:23 +01:00
}
2024-06-13 03:11:35 +02:00
if ( sched - > debug ) {
ggml_backend_sched_print_assignments ( sched , graph ) ;
}
// swap node_backend_ids and leaf_backend_ids and prevs
{
int * tmp = sched - > node_backend_ids ;
sched - > node_backend_ids = sched - > prev_node_backend_ids ;
sched - > prev_node_backend_ids = tmp ;
tmp = sched - > leaf_backend_ids ;
sched - > leaf_backend_ids = sched - > prev_leaf_backend_ids ;
sched - > prev_leaf_backend_ids = tmp ;
}
2023-11-13 13:16:23 +01:00
// create copies of the graph for each split
2024-03-13 18:54:21 +01:00
// TODO: avoid this copy
2024-03-18 11:03:04 +01:00
struct ggml_cgraph * graph_copy = ggml_new_graph_custom ( sched - > ctx , graph - > n_nodes + sched - > n_splits * GGML_SCHED_MAX_SPLIT_INPUTS * 2 , false ) ;
2023-11-13 13:16:23 +01:00
for ( int i = 0 ; i < sched - > n_splits ; i + + ) {
struct ggml_backend_sched_split * split = & sched - > splits [ i ] ;
2023-12-07 21:26:54 +01:00
split - > graph = ggml_graph_view ( graph , split - > i_start , split - > i_end ) ;
2023-11-13 13:16:23 +01:00
2024-03-13 18:54:21 +01:00
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
2023-11-13 13:16:23 +01:00
for ( int j = 0 ; j < split - > n_inputs ; j + + ) {
2024-03-18 11:03:04 +01:00
assert ( graph_copy - > size > ( graph_copy - > n_nodes + 1 ) ) ;
2023-11-13 13:16:23 +01:00
struct ggml_tensor * input = split - > inputs [ j ] ;
2024-03-18 11:03:04 +01:00
const size_t input_id = hash_id ( input ) ;
struct ggml_tensor * input_cpy = sched - > tensor_copies [ input_id ] [ split - > backend_id ] [ sched - > cur_copy ] ;
2024-02-12 08:16:06 +01:00
2024-01-12 20:07:38 +01:00
// add a dependency to the input source so that it is not freed before the copy is done
2024-02-12 08:16:06 +01:00
struct ggml_tensor * input_dep = ggml_view_tensor ( sched - > ctx , input ) ;
2024-03-13 18:54:21 +01:00
input_dep - > src [ 0 ] = input ;
2024-03-18 11:03:04 +01:00
sched - > node_backend_ids [ graph_copy - > n_nodes ] = sched - > tensor_backend_id [ input_id ] ;
2024-02-12 08:16:06 +01:00
graph_copy - > nodes [ graph_copy - > n_nodes + + ] = input_dep ;
// add a dependency to the input copy so that it is allocated at the start of the split
sched - > node_backend_ids [ graph_copy - > n_nodes ] = split - > backend_id ;
2023-11-13 13:16:23 +01:00
graph_copy - > nodes [ graph_copy - > n_nodes + + ] = input_cpy ;
}
for ( int j = split - > i_start ; j < split - > i_end ; j + + ) {
2024-03-18 11:03:04 +01:00
assert ( graph_copy - > size > graph_copy - > n_nodes ) ;
2024-02-12 08:16:06 +01:00
sched - > node_backend_ids [ graph_copy - > n_nodes ] = tensor_backend_id ( graph - > nodes [ j ] ) ;
2023-11-13 13:16:23 +01:00
graph_copy - > nodes [ graph_copy - > n_nodes + + ] = graph - > nodes [ j ] ;
}
}
2024-03-13 18:54:21 +01:00
if ( sched - > n_copies > 1 ) {
// add input copies as leafs so that they are allocated first
for ( int i = 0 ; i < sched - > n_graph_inputs ; i + + ) {
struct ggml_tensor * input = sched - > graph_inputs [ i ] ;
size_t id = hash_id ( input ) ;
int backend_id = tensor_backend_id ( input ) ;
for ( int c = 0 ; c < sched - > n_copies ; c + + ) {
struct ggml_tensor * input_cpy = sched - > tensor_copies [ id ] [ backend_id ] [ c ] ;
sched - > leaf_backend_ids [ graph_copy - > n_leafs ] = backend_id ;
graph_copy - > leafs [ graph_copy - > n_leafs + + ] = input_cpy ;
}
}
for ( int i = 0 ; i < sched - > n_splits ; i + + ) {
struct ggml_backend_sched_split * split = & sched - > splits [ i ] ;
int backend_id = split - > backend_id ;
for ( int j = 0 ; j < split - > n_inputs ; j + + ) {
struct ggml_tensor * input = split - > inputs [ j ] ;
size_t id = hash_id ( input ) ;
for ( int c = 0 ; c < sched - > n_copies ; c + + ) {
struct ggml_tensor * input_cpy = sched - > tensor_copies [ id ] [ backend_id ] [ c ] ;
sched - > leaf_backend_ids [ graph_copy - > n_leafs ] = backend_id ;
graph_copy - > leafs [ graph_copy - > n_leafs + + ] = input_cpy ;
}
}
}
}
// add leafs from the original graph
for ( int i = 0 ; i < graph - > n_leafs ; i + + ) {
struct ggml_tensor * leaf = graph - > leafs [ i ] ;
sched - > leaf_backend_ids [ graph_copy - > n_leafs ] = tensor_backend_id ( leaf ) ;
graph_copy - > leafs [ graph_copy - > n_leafs + + ] = leaf ;
}
2023-11-13 13:16:23 +01:00
sched - > graph = graph_copy ;
}
2024-02-12 08:16:06 +01:00
// Allocate the tensors of sched->graph through sched->galloc.
// If any node/leaf backend id differs from the previous graph, or the plain
// allocation attempt fails, all backends are synchronized, the allocator is
// re-reserved for the current graph and the allocation is retried once.
// Returns false only when the retry fails as well.
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
    // compare the current backend assignments with the ones of the previous graph
    bool ids_differ = false;
    for (int n = 0; n < sched->graph->n_nodes && !ids_differ; n++) {
        ids_differ = sched->node_backend_ids[n] != sched->prev_node_backend_ids[n];
    }
    for (int l = 0; l < sched->graph->n_leafs && !ids_differ; l++) {
        ids_differ = sched->leaf_backend_ids[l] != sched->prev_leaf_backend_ids[l];
    }

    // unchanged assignments: try to allocate without reserving again
    // (the allocation is not attempted at all when the assignments changed)
    if (!ids_differ && ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
        return true;
    }

    // the re-allocation may cause the split inputs to be moved to a different address
    ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG
    fprintf(stderr, " %s: failed to allocate graph, reserving \n ", __func__);
#endif
    ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);

    if (ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
        return true;
    }

    fprintf(stderr, " %s: failed to allocate graph \n ", __func__);
    return false;
}
2024-03-04 10:05:42 +01:00
// Compute every split of the scheduled graph on the backend assigned to it.
// For each split the inputs are first copied into the split backend's copy
// (tensor_copies[..][split backend][cur_copy]) and then the split graph is run,
// either in one call or, when an eval callback is set, in batches delimited by
// the nodes the callback asks for.
// Returns the first status other than GGML_STATUS_SUCCESS reported by
// ggml_backend_graph_compute_async (cur_copy is not advanced in that case);
// otherwise advances cur_copy and returns GGML_STATUS_SUCCESS.
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
    struct ggml_backend_sched_split * const all_splits = sched->splits;

    for (int i_split = 0; i_split < sched->n_splits; i_split++) {
        struct ggml_backend_sched_split * split = all_splits + i_split;
        const int      split_backend_id = split->backend_id;
        ggml_backend_t split_backend    = sched->backends[split_backend_id];

        // copy the input tensors to the split backend
        for (int i_input = 0; i_input < split->n_inputs; i_input++) {
            struct ggml_tensor * input         = split->inputs[i_input];
            ggml_backend_t       input_backend = ggml_backend_sched_get_tensor_backend(sched, input);
            struct ggml_tensor * input_cpy     = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];

            const bool is_user_input = (input->flags & GGML_TENSOR_FLAG_INPUT) != 0;
            const bool has_event     = sched->events[split_backend_id][sched->cur_copy] != NULL;

            // wait until the split backend no longer uses the destination copy
            if (!has_event) {
                ggml_backend_synchronize(split_backend);
            } else if (is_user_input) {
                ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
            } else {
                ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
            }

            if (is_user_input) {
                // inputs from the user must be copied immediately to prevent the user
                // overwriting the data before the copy is done
                ggml_backend_tensor_copy(input, input_cpy);
            } else {
                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
            }
        }

        if (sched->callback_eval) {
            // similar to ggml_backend_compare_graph_backend
            int first = 0;
            while (first < split->graph.n_nodes) {
                int last = first;
                struct ggml_tensor * t = split->graph.nodes[last];

                // ask the user whether the data of this node is needed
                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);

                // extend the batch [first, last] until a needed node or the end of the split is reached
                while (!need && last + 1 < split->graph.n_nodes) {
                    last++;
                    t    = split->graph.nodes[last];
                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
                }

                struct ggml_cgraph gv = ggml_graph_view(&split->graph, first, last + 1);

                const enum ggml_status batch_status = ggml_backend_graph_compute_async(split_backend, &gv);
                if (batch_status != GGML_STATUS_SUCCESS) {
                    return batch_status;
                }

                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
                ggml_backend_synchronize(split_backend);

                // a false return from the callback stops the rest of this split only
                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                    break;
                }

                first = last + 1;
            }
        } else {
            const enum ggml_status split_status = ggml_backend_graph_compute_async(split_backend, &split->graph);
            if (split_status != GGML_STATUS_SUCCESS) {
                return split_status;
            }
        }

        // record the event of this copy
        if (split->n_inputs > 0 && sched->events[split_backend_id][sched->cur_copy] != NULL) {
            ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
        }
    }

    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;

    return GGML_STATUS_SUCCESS;
}
2024-03-13 18:54:21 +01:00
// Create a scheduler for the given backends.
//   backends   - array of n_backends backends; the last one must be the CPU backend
//   bufts      - optional array of n_backends buffer types; when NULL the default
//                buffer type of each backend is used
//   n_backends - number of backends, in (0, GGML_SCHED_MAX_BACKENDS]
//   graph_size - size passed to ggml_hash_set_new and used to size the per-node tables
//   parallel   - when true GGML_SCHED_MAX_COPIES copies are used and one event per
//                backend and copy is created; otherwise a single copy and no events
// Every allocation is checked: running out of memory aborts through GGML_ASSERT
// instead of dereferencing a NULL pointer.
// The returned scheduler is released with ggml_backend_sched_free.
ggml_backend_sched_t ggml_backend_sched_new(
        ggml_backend_t * backends,
        ggml_backend_buffer_type_t * bufts,
        int n_backends,
        size_t graph_size,
        bool parallel) {
    GGML_ASSERT(n_backends > 0);
    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
    GGML_ASSERT(sched != NULL);

    sched->debug = getenv(" GGML_SCHED_DEBUG ") != NULL;

    // initialize hash table
    sched->hash_set          = ggml_hash_set_new(graph_size);
    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
    sched->tensor_copies     = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
    GGML_ASSERT(sched->tensor_backend_id != NULL);
    GGML_ASSERT(sched->tensor_copies != NULL);

    // room for the graph nodes plus the extra entries added per split input
    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
    sched->node_backend_ids      = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
    sched->leaf_backend_ids      = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
    GGML_ASSERT(sched->node_backend_ids != NULL);
    GGML_ASSERT(sched->leaf_backend_ids != NULL);
    GGML_ASSERT(sched->prev_node_backend_ids != NULL);
    GGML_ASSERT(sched->prev_leaf_backend_ids != NULL);

    sched->n_backends = n_backends;

    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

    const int initial_splits_capacity = 16;
    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
    GGML_ASSERT(sched->splits != NULL);
    sched->splits_capacity = initial_splits_capacity;

    for (int b = 0; b < n_backends; b++) {
        sched->backends[b] = backends[b];
        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
        // events are only created when more than one copy is in use
        if (sched->n_copies > 1) {
            for (int c = 0; c < sched->n_copies; c++) {
                sched->events[b][c] = ggml_backend_event_new(backends[b]);
            }
        }
    }

    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);

    ggml_backend_sched_reset(sched);

    return sched;
}
// Release a scheduler and everything it owns: the per-backend events, the graph
// allocator, the ggml context, the split array, the hash set keys and all the
// backend id tables. Passing NULL is a no-op.
void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    if (sched != NULL) {
        // events, one per backend and copy
        int b = 0;
        while (b < sched->n_backends) {
            int c = 0;
            while (c < sched->n_copies) {
                ggml_backend_event_free(sched->events[b][c]);
                c++;
            }
            b++;
        }

        ggml_gallocr_free(sched->galloc);
        ggml_free(sched->ctx);

        // heap arrays owned by the scheduler
        free(sched->splits);
        free(sched->hash_set.keys);
        free(sched->tensor_backend_id);
        free(sched->tensor_copies);
        free(sched->node_backend_ids);
        free(sched->leaf_backend_ids);
        free(sched->prev_node_backend_ids);
        free(sched->prev_leaf_backend_ids);

        free(sched);
    }
}
2024-02-12 08:16:06 +01:00
// Reset the scheduler state for the next run.
// Unless the scheduler is already flagged as reset, the hash set keys and the
// tensor copies table are zeroed and every byte of the tensor backend id table
// is set to 0xFF (i.e. each id reads as -1). is_alloc is always cleared.
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
    if (sched->is_reset) {
        sched->is_alloc = false;
        return;
    }

    const size_t n_slots = sched->hash_set.size;

    memset(sched->hash_set.keys,      0, n_slots * sizeof(sched->hash_set.keys[0])); // NOLINT
    memset(sched->tensor_backend_id, -1, n_slots * sizeof(sched->tensor_backend_id[0]));
    memset(sched->tensor_copies,      0, n_slots * sizeof(sched->tensor_copies[0]));

    sched->is_reset = true;
    sched->is_alloc = false;
}
2023-11-13 13:16:23 +01:00
2024-02-12 08:16:06 +01:00
// Split measure_graph and reserve allocator memory for the resulting graph.
// Asserts that the hash set has at least as many slots as the graph has nodes.
// On success the scheduler is reset and all backends are synchronized; on
// failure false is returned right away, without resetting or synchronizing.
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);

    ggml_backend_sched_split_graph(sched, measure_graph);

    // TODO: extract this to a separate function
    const bool reserved = ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
    if (reserved) {
        ggml_backend_sched_reset(sched);
        ggml_backend_sched_synchronize(sched);
    }

    return reserved;
}
// Split graph across the backends and allocate its tensors.
// Asserts that the hash set has at least as many slots as the graph has nodes.
// Sets is_alloc and returns true when the allocation succeeded; returns false
// (leaving is_alloc untouched) otherwise.
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);

    ggml_backend_sched_split_graph(sched, graph);

    const bool allocated = ggml_backend_sched_alloc_splits(sched);
    if (allocated) {
        sched->is_alloc = true;
    }

    return allocated;
}
2024-03-04 10:05:42 +01:00
// Blocking variant of ggml_backend_sched_graph_compute_async: starts the
// computation, synchronizes all backends of the scheduler (also when the
// computation reported an error) and returns the status of the computation.
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    const enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);

    ggml_backend_sched_synchronize(sched);

    return status;
}
2024-01-12 20:07:38 +01:00
2024-03-13 18:54:21 +01:00
// Compute graph without waiting for the backends to finish.
// When no graph is currently allocated, the scheduler is reset first (unless it
// already is) and graph is split and allocated; a failed allocation yields
// GGML_STATUS_ALLOC_FAILED. When a graph is already allocated, the splits
// prepared by that allocation are computed as they are.
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    if (!sched->is_alloc) {
        if (!sched->is_reset) {
            ggml_backend_sched_reset(sched);
        }

        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
            return GGML_STATUS_ALLOC_FAILED;
        }
    }

    return ggml_backend_sched_compute_splits(sched);
}
2024-01-17 17:39:41 +01:00
2024-03-13 18:54:21 +01:00
// Synchronize every backend of the scheduler, in backend index order.
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
    int b = 0;
    while (b < sched->n_backends) {
        ggml_backend_synchronize(sched->backends[b]);
        b++;
    }
}
2024-01-17 17:39:41 +01:00
// Store the eval callback and the opaque user_data pointer that is handed back
// to it on every call. A NULL callback makes the splits be computed without
// per-node callbacks.
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
    sched->callback_eval_user_data = user_data;
    sched->callback_eval           = callback;
}
2024-01-12 20:07:38 +01:00
// Number of splits currently stored in the scheduler (sched->n_splits).
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
    const int n_splits = sched->n_splits;
    return n_splits;
}
2024-03-13 18:54:21 +01:00
// Number of copies the scheduler was configured with (sched->n_copies).
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
    const int n_copies = sched->n_copies;
    return n_copies;
}
2024-02-12 08:16:06 +01:00
// Size reported by the graph allocator for the buffer that belongs to backend.
// Asserts that backend is one of the backends of this scheduler.
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

    const size_t buffer_size = ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
    return buffer_size;
}
2024-03-13 18:54:21 +01:00
// Assign node to backend on behalf of the user and record the cause of the
// assignment. Asserts that backend is one of the backends of this scheduler.
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

    int * assigned_id = &tensor_backend_id(node);
    *assigned_id = backend_index;
    SET_CAUSE(node, " usr ");
}
2023-12-07 21:26:54 +01:00
2024-03-13 18:54:21 +01:00
// Backend that node is currently assigned to, or NULL when its backend id is -1
// (no backend assigned).
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
    const int assigned_id = tensor_backend_id(node);
    return assigned_id == -1 ? NULL : sched->backends[assigned_id];
}
2023-12-07 21:26:54 +01:00
// utils
2024-01-12 20:07:38 +01:00
2024-06-03 19:03:26 +02:00
void ggml_backend_view_init ( struct ggml_tensor * tensor ) {
2023-12-07 21:26:54 +01:00
GGML_ASSERT ( tensor - > buffer = = NULL ) ;
GGML_ASSERT ( tensor - > view_src ! = NULL ) ;
GGML_ASSERT ( tensor - > view_src - > buffer ! = NULL ) ;
GGML_ASSERT ( tensor - > view_src - > data ! = NULL ) ;
2024-06-03 19:03:26 +02:00
tensor - > buffer = tensor - > view_src - > buffer ;
2023-12-07 21:26:54 +01:00
tensor - > data = ( char * ) tensor - > view_src - > data + tensor - > view_offs ;
2024-06-03 19:03:26 +02:00
ggml_backend_buffer_init_tensor ( tensor - > buffer , tensor ) ;
2023-12-07 21:26:54 +01:00
}
// Place an unallocated, non-view tensor at addr inside buffer.
// Asserts that the range [addr, addr + alloc size of the tensor) lies between the
// base of the buffer and base + buffer size, then binds the tensor to the buffer
// and lets the buffer run its per-tensor initialization.
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
    // the tensor must not be bound to anything yet
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->data == NULL);
    GGML_ASSERT(tensor->view_src == NULL);

    // bounds check against the buffer
    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));

    tensor->data   = addr;
    tensor->buffer = buffer;

    ggml_backend_buffer_init_tensor(buffer, tensor);
}
2024-02-12 08:16:06 +01:00
static struct ggml_tensor * graph_copy_dup_tensor ( struct ggml_hash_set hash_set , struct ggml_tensor * * node_copies ,
2023-12-07 21:26:54 +01:00
struct ggml_context * ctx_allocated , struct ggml_context * ctx_unallocated , struct ggml_tensor * src ) {
GGML_ASSERT ( src ! = NULL ) ;
GGML_ASSERT ( src - > data & & " graph must be allocated " ) ;
size_t id = ggml_hash_insert ( hash_set , src ) ;
if ( id = = GGML_HASHTABLE_ALREADY_EXISTS ) {
return node_copies [ ggml_hash_find ( hash_set , src ) ] ;
}
struct ggml_tensor * dst = ggml_dup_tensor_layout ( src - > data & & ! src - > view_src ? ctx_allocated : ctx_unallocated , src ) ;
if ( src - > view_src ! = NULL ) {
2024-02-12 08:16:06 +01:00
dst - > view_src = graph_copy_dup_tensor ( hash_set , node_copies , ctx_allocated , ctx_unallocated , src - > view_src ) ;
2023-12-07 21:26:54 +01:00
dst - > view_offs = src - > view_offs ;
}
dst - > op = src - > op ;
memcpy ( dst - > op_params , src - > op_params , sizeof ( dst - > op_params ) ) ;
ggml_set_name ( dst , src - > name ) ;
// copy src
for ( int i = 0 ; i < GGML_MAX_SRC ; i + + ) {
struct ggml_tensor * s = src - > src [ i ] ;
if ( s = = NULL ) {
2024-02-17 22:04:16 +01:00
continue ;
2023-12-07 21:26:54 +01:00
}
2024-02-12 08:16:06 +01:00
dst - > src [ i ] = graph_copy_dup_tensor ( hash_set , node_copies , ctx_allocated , ctx_unallocated , s ) ;
2023-12-07 21:26:54 +01:00
}
node_copies [ id ] = dst ;
return dst ;
}
2024-02-12 08:16:06 +01:00
static void graph_copy_init_tensor ( struct ggml_hash_set hash_set , struct ggml_tensor * * node_copies , bool * node_init , struct ggml_tensor * src ) {
2023-12-07 21:26:54 +01:00
size_t id = ggml_hash_find ( hash_set , src ) ;
if ( node_init [ id ] ) {
return ;
}
node_init [ id ] = true ;
struct ggml_tensor * dst = node_copies [ id ] ;
if ( dst - > view_src ! = NULL ) {
2024-02-12 08:16:06 +01:00
graph_copy_init_tensor ( hash_set , node_copies , node_init , src - > view_src ) ;
2024-06-03 19:03:26 +02:00
ggml_backend_view_init ( dst ) ;
2023-12-07 21:26:54 +01:00
}
else {
ggml_backend_tensor_copy ( src , dst ) ;
}
// init src
for ( int i = 0 ; i < GGML_MAX_SRC ; i + + ) {
struct ggml_tensor * s = src - > src [ i ] ;
if ( s = = NULL ) {
2024-02-17 22:04:16 +01:00
continue ;
2023-12-07 21:26:54 +01:00
}
2024-02-12 08:16:06 +01:00
graph_copy_init_tensor ( hash_set , node_copies , node_init , s ) ;
2023-12-07 21:26:54 +01:00
}
}
// Create a copy of graph whose tensors live in a buffer allocated on backend.
// The source graph must be allocated (every tensor reachable from its nodes has data).
// Returns a ggml_backend_graph_copy holding the new buffer, the two contexts that
// own the duplicated tensors and the copied graph; release it with
// ggml_backend_graph_copy_free.
// On failure (temporary tables, contexts or backend buffer could not be
// allocated) a message is printed to stderr, everything allocated so far is
// released and a result with all members set to NULL is returned.
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
    struct ggml_context * ctx_allocated   = NULL;
    struct ggml_context * ctx_unallocated = NULL;
    ggml_backend_buffer_t buffer          = NULL;
    struct ggml_cgraph  * graph_copy      = NULL;

    // temporary tables, indexed by hash slot
    struct ggml_hash_set hash_set = {
        /* .size = */ graph->visited_hash_table.size,
        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
    };
    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true
    };

    if (hash_set.keys == NULL || node_copies == NULL || node_init == NULL) {
        fprintf(stderr, "failed to allocate memory for graph copy\n");
        goto fail;
    }

    ctx_allocated   = ggml_init(params);
    ctx_unallocated = ggml_init(params);

    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
        fprintf(stderr, " failed to allocate context for graph copy \n ");
        goto fail;
    }

    // dup nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
    }

    // allocate nodes
    buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
    if (buffer == NULL) {
        fprintf(stderr, " failed to allocate buffer for graph copy \n ");
        goto fail;
    }

    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);

    // copy data and init views
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
    }

    // build graph copy
    graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
        graph_copy->nodes[i] = node_copy;
    }
    graph_copy->n_nodes = graph->n_nodes;

    // the temporary tables are no longer needed
    free(hash_set.keys);
    free(node_copies);
    free(node_init);

    return (struct ggml_backend_graph_copy) {
        /* .buffer           = */ buffer,
        /* .ctx_allocated    = */ ctx_allocated,
        /* .ctx_unallocated  = */ ctx_unallocated,
        /* .graph            = */ graph_copy,
    };

fail:
    // single cleanup path; the contexts may still be NULL at this point
    free(hash_set.keys);
    free(node_copies);
    free(node_init);
    ggml_free(ctx_allocated);
    ggml_free(ctx_unallocated);

    return (struct ggml_backend_graph_copy) {
        /* .buffer           = */ NULL,
        /* .ctx_allocated    = */ NULL,
        /* .ctx_unallocated  = */ NULL,
        /* .graph            = */ NULL,
    };
}
void ggml_backend_graph_copy_free ( struct ggml_backend_graph_copy copy ) {
ggml_backend_buffer_free ( copy . buffer ) ;
ggml_free ( copy . ctx_allocated ) ;
ggml_free ( copy . ctx_unallocated ) ;
}
2024-01-12 20:07:38 +01:00
// Runs `graph` one node at a time on two backends and hands each pair of results to `callback`.
//
// `graph` itself is computed on backend1; a copy made with ggml_backend_graph_copy() is
// computed on backend2. For every node index i both backends compute a graph view built
// with ggml_graph_view(g, i, i + 1). Unless the node's op is a view op, `callback` is then
// called with the index, the tensor from `graph`, the tensor from the copy and `user_data`.
// A false return value from `callback` stops the iteration.
//
// Returns false if the graph copy could not be created, true otherwise (including when
// `callback` stopped the iteration early).
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
    if (copy.buffer == NULL) {
        return false;
    }

    struct ggml_cgraph * g1 = graph;
    struct ggml_cgraph * g2 = copy.graph;

    assert(g1->n_nodes == g2->n_nodes);

    for (int i = 0; i < g1->n_nodes; i++) {
        struct ggml_tensor * t1 = g1->nodes[i];
        struct ggml_tensor * t2 = g2->nodes[i];

        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));

        // evaluate node i on each backend through its own graph view
        struct ggml_cgraph view1 = ggml_graph_view(g1, i, i + 1);
        struct ggml_cgraph view2 = ggml_graph_view(g2, i, i + 1);

        ggml_backend_graph_compute(backend1, &view1);
        ggml_backend_graph_compute(backend2, &view2);

        // view ops are not reported to the callback; for every other op the callback
        // decides whether the comparison goes on
        if (!ggml_is_view_op(t1->op) && !callback(i, t1, t2, user_data)) {
            break;
        }
    }

    ggml_backend_graph_copy_free(copy);

    return true;
}