mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 13:58:46 +01:00
Vulkan Fixes (#5223)
* Fix Vulkan F16 models * Fix Vulkan context shift crash * Add Vulkan to common.cpp dump_non_result_info_yaml function * Fix bug in Vulkan CPY op * Fix small matrix multiplication errors in AMD GPUs on Windows or with amdvlk Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com> --------- Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com>
This commit is contained in:
parent
d62520eb2c
commit
f8e9140cb4
@ -1520,6 +1520,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||||
|
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -817,7 +817,7 @@ static void ggml_vk_load_shaders() {
|
|||||||
// mulmat
|
// mulmat
|
||||||
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size };
|
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size };
|
||||||
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size };
|
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size };
|
||||||
std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 8, 32, 32, 2, 2, 2, vk_device.subgroup_size };
|
std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, vk_device.subgroup_size };
|
||||||
|
|
||||||
std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
|
std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
|
||||||
std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
|
std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
|
||||||
@ -2873,7 +2873,8 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
|||||||
if (op == GGML_OP_CPY) {
|
if (op == GGML_OP_CPY) {
|
||||||
GGML_ASSERT(!transfer_src0);
|
GGML_ASSERT(!transfer_src0);
|
||||||
GGML_ASSERT(!transfer_src1);
|
GGML_ASSERT(!transfer_src1);
|
||||||
d_sz = dst->ne[1] * dst->nb[1];
|
x_sz = ggml_nbytes(src0);
|
||||||
|
d_sz = ggml_nbytes(dst);
|
||||||
|
|
||||||
if (extra->offset + d_sz >= d_D->size) {
|
if (extra->offset + d_sz >= d_D->size) {
|
||||||
d_sz = VK_WHOLE_SIZE;
|
d_sz = VK_WHOLE_SIZE;
|
||||||
@ -4556,8 +4557,15 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
|||||||
}
|
}
|
||||||
ggml_vk_preallocate_buffers();
|
ggml_vk_preallocate_buffers();
|
||||||
|
|
||||||
|
int last_node = cgraph->n_nodes - 1;
|
||||||
|
|
||||||
|
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
||||||
|
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
|
||||||
|
last_node -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
ggml_vk_build_graph(cgraph->nodes[i], i == last_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_compute_params params = {};
|
ggml_compute_params params = {};
|
||||||
|
@ -19,8 +19,8 @@ shader_int8_ext = """
|
|||||||
|
|
||||||
# Type-specific defines
|
# Type-specific defines
|
||||||
shader_f16_defines = """
|
shader_f16_defines = """
|
||||||
#define QUANT_K 32
|
#define QUANT_K 1
|
||||||
#define QUANT_R 2
|
#define QUANT_R 1
|
||||||
|
|
||||||
#define A_TYPE float16_t
|
#define A_TYPE float16_t
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user