mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-26 20:22:25 +01:00
CUDA: faster multi GPU synchronization (#2448)
This commit is contained in:
parent
8a88e5855c
commit
9baf9ef304
@ -3529,13 +3529,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|||||||
if (split) {
|
if (split) {
|
||||||
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
|
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
|
||||||
// dst is NOT transposed.
|
// dst is NOT transposed.
|
||||||
// The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
||||||
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
||||||
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
||||||
for (int64_t j = 0; j < ne1; ++j) {
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
|
||||||
float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
|
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
|
i01_diff*sizeof(float), ne1, kind, cudaStream_main));
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
|
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
|
||||||
|
Loading…
Reference in New Issue
Block a user