mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-02-05 08:00:42 +01:00
parent
80d0d6b4b7
commit
2139667ec4
@ -4416,7 +4416,6 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||
device const half * dh = &x[ib].d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
|
||||
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
@ -4447,7 +4446,7 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
@ -4613,7 +4612,7 @@ void kernel_mul_mv_q3_K_f32_impl(
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
if (tiisg == 0) {
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
|
||||
dst_f32[first_row + row] = sumf1[row];
|
||||
}
|
||||
}
|
||||
@ -4729,7 +4728,7 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
@ -4861,7 +4860,7 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = tot;
|
||||
@ -4906,6 +4905,10 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||
|
||||
const int row = 2*r0 + sgitg;
|
||||
|
||||
if (row >= args.ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i12 = im%args.ne12;
|
||||
const uint i13 = im/args.ne12;
|
||||
|
||||
@ -5061,7 +5064,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum * 0.25f;
|
||||
@ -5179,7 +5182,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum * 0.25f;
|
||||
@ -5289,7 +5292,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum * 0.5f;
|
||||
@ -5401,7 +5404,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
@ -5514,7 +5517,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum * 0.25f;
|
||||
@ -5614,7 +5617,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
@ -5709,7 +5712,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
@ -5799,7 +5802,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
|
||||
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
@ -5888,7 +5891,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
|
||||
|
||||
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
|
Loading…
Reference in New Issue
Block a user