mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-27 04:23:06 +01:00
parent
99a3755a3c
commit
c07d437bbd
@ -7,14 +7,12 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <cinttypes>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
// TODO: replace with ggml API call
|
|
||||||
#define QK_K 256
|
|
||||||
|
|
||||||
static void zeros(std::ofstream & file, size_t n) {
|
static void zeros(std::ofstream & file, size_t n) {
|
||||||
char zero = 0;
|
char zero = 0;
|
||||||
for (size_t i = 0; i < n; ++i) {
|
for (size_t i = 0; i < n; ++i) {
|
||||||
@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|||||||
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
||||||
new_type = qs.params->output_tensor_type;
|
new_type = qs.params->output_tensor_type;
|
||||||
} else {
|
} else {
|
||||||
int nx = tensor->ne[0];
|
const int64_t nx = tensor->ne[0];
|
||||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
const int64_t qk_k = ggml_blck_size(new_type);
|
||||||
|
|
||||||
|
if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
|
||||||
new_type = GGML_TYPE_Q8_0;
|
new_type = GGML_TYPE_Q8_0;
|
||||||
}
|
}
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||||
@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|||||||
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
||||||
//}
|
//}
|
||||||
bool convert_incompatible_tensor = false;
|
bool convert_incompatible_tensor = false;
|
||||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
{
|
||||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
const int64_t nx = tensor->ne[0];
|
||||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
const int64_t ny = tensor->ne[1];
|
||||||
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
|
const int64_t qk_k = ggml_blck_size(new_type);
|
||||||
new_type == GGML_TYPE_IQ1_M) {
|
|
||||||
int nx = tensor->ne[0];
|
if (nx % qk_k != 0) {
|
||||||
int ny = tensor->ne[1];
|
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
|
||||||
if (nx % QK_K != 0) {
|
|
||||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
|
|
||||||
convert_incompatible_tensor = true;
|
convert_incompatible_tensor = true;
|
||||||
} else {
|
} else {
|
||||||
++qs.n_k_quantized;
|
++qs.n_k_quantized;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (convert_incompatible_tensor) {
|
if (convert_incompatible_tensor) {
|
||||||
switch (new_type) {
|
switch (new_type) {
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
|
Loading…
Reference in New Issue
Block a user