diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 15a23ef6c..aa52cee64 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1390,12 +1390,14 @@ class LlamaModel(Model): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") + @Model.register("BitnetForCausalLM") class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET + def set_vocab(self): self._set_vocab_sentencepiece() - + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_name("Bitnet") @@ -1407,9 +1409,7 @@ class BitnetModel(Model): self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) @@ -1430,6 +1430,7 @@ class BitnetModel(Model): return [(self.map_tensor_name(name), data_torch)] + @Model.register("GrokForCausalLM") class GrokModel(Model): model_arch = gguf.MODEL_ARCH.GROK diff --git a/ggml.c b/ggml.c index e59093cdf..562415d60 100644 --- a/ggml.c +++ b/ggml.c @@ -12349,7 +12349,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( // attempt to reduce false-sharing (does not seem to make a difference) // 16 * 2, accounting for mmla kernels float tmp[32]; - float * scale = (float * )((uint8_t*) (src0->data) + (ne00 * ne01 / 4)); + const float * scale = (float * )((uint8_t*) (src0->data) + (ne00 * ne01 / 4)); const float * act_scales = (const float*) ((const char *) wdata + (ne11 * ne10)); for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { diff --git a/llama.cpp b/llama.cpp index 0d271c748..c775ae79b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -15961,6 +15961,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { new_type = params->output_tensor_type; } + // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. quantize = tensor->type != new_type;