mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 21:37:19 +01:00
Windows fixes (#31)
* Apply fixes suggested to build on Windows (issue: https://github.com/ggerganov/llama.cpp/issues/22)
* Remove unsupported VLAs.
* MSVC: Remove features that are only available in MSVC's C++20 mode.
* Fix zero initialization of the other fields.
* Change the use of vector for stack allocations.
This commit is contained in:
parent
7027a97837
commit
eb062bb012
20
ggml.c
20
ggml.c
@ -407,8 +407,8 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||||||
const int nb = k / QK;
|
const int nb = k / QK;
|
||||||
const size_t bs = sizeof(float) + QK/2;
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
|
|
||||||
uint8_t * restrict pd = (uint8_t *) (y + 0*bs);
|
uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
|
||||||
uint8_t * restrict pb = (uint8_t *) (y + 0*bs + sizeof(float));
|
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
|
||||||
|
|
||||||
uint8_t pp[QK/2];
|
uint8_t pp[QK/2];
|
||||||
|
|
||||||
@ -654,8 +654,8 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
|
|||||||
const int nb = k / QK;
|
const int nb = k / QK;
|
||||||
const size_t bs = sizeof(float) + QK/2;
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
|
|
||||||
const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
|
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
|
||||||
const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
|
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||||
|
|
||||||
// scalar
|
// scalar
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
@ -1301,11 +1301,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
|||||||
|
|
||||||
const size_t bs = sizeof(float) + QK/2;
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
|
|
||||||
const uint8_t * restrict pd0 = (const uint8_t *) (x + 0*bs);
|
const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
|
||||||
const uint8_t * restrict pd1 = (const uint8_t *) (y + 0*bs);
|
const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
|
||||||
|
|
||||||
const uint8_t * restrict pb0 = (const uint8_t *) (x + 0*bs + sizeof(float));
|
const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||||
const uint8_t * restrict pb1 = (const uint8_t *) (y + 0*bs + sizeof(float));
|
const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + sizeof(float));
|
||||||
|
|
||||||
float sumf = 0.0;
|
float sumf = 0.0;
|
||||||
|
|
||||||
@ -1731,8 +1731,8 @@ inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * res
|
|||||||
const int nb = n / QK;
|
const int nb = n / QK;
|
||||||
const size_t bs = sizeof(float) + QK/2;
|
const size_t bs = sizeof(float) + QK/2;
|
||||||
|
|
||||||
const uint8_t * restrict pd = (const uint8_t *) (x + 0*bs);
|
const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
|
||||||
const uint8_t * restrict pb = (const uint8_t *) (x + 0*bs + sizeof(float));
|
const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
|
||||||
|
|
||||||
#if __ARM_NEON
|
#if __ARM_NEON
|
||||||
#if QK == 32
|
#if QK == 32
|
||||||
|
12
main.cpp
12
main.cpp
@ -209,8 +209,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
|||||||
// create the ggml context
|
// create the ggml context
|
||||||
{
|
{
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
.mem_size = ctx_size,
|
/*.mem_size =*/ ctx_size,
|
||||||
.mem_buffer = NULL,
|
/*.mem_buffer =*/ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
model.ctx = ggml_init(params);
|
model.ctx = ggml_init(params);
|
||||||
@ -546,12 +546,13 @@ bool llama_eval(
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
.mem_size = buf_size,
|
/*.mem_size =*/ buf_size,
|
||||||
.mem_buffer = buf,
|
/*.mem_buffer =*/ buf,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_context * ctx0 = ggml_init(params);
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
struct ggml_cgraph gf = { .n_threads = n_threads };
|
ggml_cgraph gf = {};
|
||||||
|
gf.n_threads = n_threads;
|
||||||
|
|
||||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||||
@ -733,6 +734,7 @@ bool llama_eval(
|
|||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
|
ggml_time_init();
|
||||||
const int64_t t_main_start_us = ggml_time_us();
|
const int64_t t_main_start_us = ggml_time_us();
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
@ -289,6 +289,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
|
|||||||
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
|
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
|
||||||
//
|
//
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
|
ggml_time_init();
|
||||||
if (argc != 4) {
|
if (argc != 4) {
|
||||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||||
fprintf(stderr, " type = 2 - q4_0\n");
|
fprintf(stderr, " type = 2 - q4_0\n");
|
||||||
|
16
utils.cpp
16
utils.cpp
@ -5,6 +5,12 @@
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||||
|
#elif !defined(__FreeBSD__)
|
||||||
|
#include <alloca.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
std::string arg = argv[i];
|
std::string arg = argv[i];
|
||||||
@ -472,7 +478,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
|
|||||||
|
|
||||||
assert(k % qk == 0);
|
assert(k % qk == 0);
|
||||||
|
|
||||||
uint8_t pp[qk/2];
|
const size_t pp_size = qk / 2;
|
||||||
|
uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
|
||||||
|
|
||||||
char * pdst = (char *) dst;
|
char * pdst = (char *) dst;
|
||||||
|
|
||||||
@ -511,7 +518,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
|
|||||||
pp[l/2] = vi0 | (vi1 << 4);
|
pp[l/2] = vi0 | (vi1 << 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(pb, pp, sizeof(pp));
|
memcpy(pb, pp, pp_size);
|
||||||
pb += bs;
|
pb += bs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -526,7 +533,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
|||||||
|
|
||||||
assert(k % qk == 0);
|
assert(k % qk == 0);
|
||||||
|
|
||||||
uint8_t pp[qk/2];
|
const size_t pp_size = qk / 2;
|
||||||
|
uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
|
||||||
|
|
||||||
char * pdst = (char *) dst;
|
char * pdst = (char *) dst;
|
||||||
|
|
||||||
@ -570,7 +578,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
|
|||||||
pp[l/2] = vi0 | (vi1 << 4);
|
pp[l/2] = vi0 | (vi1 << 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(pb + i*qk/2, pp, sizeof(pp));
|
memcpy(pb + i*qk/2, pp, pp_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user