2023-08-07 08:35:53 +03:00
|
|
|
// Compatible with Zig Version 0.11.0
|
2023-04-05 15:06:02 +00:00
|
|
|
const std = @import("std");
|
2023-08-17 23:11:18 +03:00
|
|
|
const ArrayList = std.ArrayList;
|
2023-08-07 08:35:53 +03:00
|
|
|
const Compile = std.Build.Step.Compile;
|
|
|
|
const ConfigHeader = std.Build.Step.ConfigHeader;
|
|
|
|
const Mode = std.builtin.Mode;
|
|
|
|
const CrossTarget = std.zig.CrossTarget;
|
2023-04-05 15:06:02 +00:00
|
|
|
|
2023-08-07 08:35:53 +03:00
|
|
|
/// Collects the settings shared by every artifact of this build (target,
/// optimization mode, include directories, C and C++ flags) and offers
/// helpers that create object and executable compile steps from them.
const Maker = struct {
    builder: *std.build.Builder,
    target: CrossTarget,
    optimize: Mode,
    /// Copied into `want_lto` of every step created by `obj` and `exe`.
    enable_lto: bool,

    /// Include directories added to every step created by `obj` and `exe`.
    include_dirs: ArrayList([]const u8),
    /// Flags passed when compiling `.c` sources.
    cflags: ArrayList([]const u8),
    /// Flags passed when compiling all other (C++) sources.
    cxxflags: ArrayList([]const u8),
    /// Objects added to every executable created by `exe`, after its explicit deps.
    objs: ArrayList(*Compile),

    /// Append `dir` to the shared include directories.
    fn addInclude(m: *Maker, dir: []const u8) !void {
        try m.include_dirs.append(dir);
    }

    /// Append an include directory given as path components relative to the build root.
    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
    }

    /// Append a flag used for C sources only.
    fn addCFlag(m: *Maker, flag: []const u8) !void {
        try m.cflags.append(flag);
    }

    /// Append a flag used for C++ sources only.
    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
        try m.cxxflags.append(flag);
    }

    /// Append a flag used for both C and C++ sources.
    fn addFlag(m: *Maker, flag: []const u8) !void {
        try m.addCFlag(flag);
        try m.addCxxFlag(flag);
    }

    /// Strip surrounding whitespace from the output of `git rev-parse HEAD`.
    /// Returns "unknown" when nothing is left, which happens when git printed
    /// no hash (for example when the sources are not a git checkout).
    fn trimCommitHash(raw: []const u8) []const u8 {
        const trimmed = std.mem.trim(u8, raw, " \t\r\n");
        return if (trimmed.len == 0) "unknown" else trimmed;
    }

    /// Link the C++ runtime appropriate for the step's target ABI.
    fn linkCxxRuntime(step: *Compile) void {
        if (step.target.getAbi() == .msvc) {
            step.linkLibC(); // need winsdk + crt
        } else {
            // linkLibCpp already adds libc++, libunwind and libc
            step.linkLibCpp();
        }
    }

    /// Create a `Maker` from the standard target/optimize options.
    ///
    /// Side effect: writes `common/build-info.cpp` (relative to the current
    /// working directory) containing the commit hash, the Zig version and the
    /// target description. Fails if `git` cannot be spawned or the file
    /// cannot be written.
    fn init(builder: *std.build.Builder) !Maker {
        const target = builder.standardTargetOptions(.{});
        const zig_version = @import("builtin").zig_version_string;
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
        // The raw output ends with a newline, or is empty when git printed
        // nothing; trimCommitHash handles both without slicing out of bounds.
        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
            \\int LLAMA_BUILD_NUMBER = {};
            \\char const *LLAMA_COMMIT = "{s}";
            \\char const *LLAMA_COMPILER = "Zig {s}";
            \\char const *LLAMA_BUILD_TARGET = "{s}";
            \\
        , .{ 0, trimCommitHash(commit_hash.stdout), zig_version, try target.allocDescription(builder.allocator) }));

        var m = Maker{
            .builder = builder,
            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };

        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
        try m.addProjectInclude(&.{"common"});
        return m;
    }

    /// Create an object step named `name` from the single source file `src`.
    /// Files ending in `.c` are compiled with the C flags and linked against
    /// libc; everything else is compiled with the C++ flags and linked
    /// against the C++ runtime chosen by `linkCxxRuntime`.
    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);

        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            linkCxxRuntime(o);
        }
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
    }

    /// Create and install an executable named `name` from the C++ source
    /// `src`, adding the objects in `deps` followed by the objects in `m.objs`.
    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
        e.addCSourceFiles(&.{src}, m.cxxflags.items);
        for (deps) |d| e.addObject(d);
        for (m.objs.items) |o| e.addObject(o);
        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });

        // https://github.com/ziglang/zig/issues/15448
        linkCxxRuntime(e);

        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
        return e;
    }
};
|
2023-06-25 13:45:44 +08:00
|
|
|
|
2023-08-17 23:11:18 +03:00
|
|
|
/// Build-script entry point.
///
/// Compiles the ggml, llama and common sources into objects, creates and
/// installs the example executables from them, and regenerates the
/// `examples/server/*.hpp` headers that embed the server's public assets.
/// The `-Dlto` option toggles LTO for every step (default: false).
pub fn build(b: *std.build.Builder) !void {
    var make = try Maker.init(b);
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
    const sgemm = make.obj("sgemm", "sgemm.cpp");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const unicode = make.obj("unicode", "unicode.cpp");
    const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
    const llama = make.obj("llama", "llama.cpp");
    // Named after its source file so it does not share the "common" name
    // with the common/common.cpp object below.
    const buildinfo = make.obj("build-info", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }

    // Embed each public server asset as a C array in examples/server/<asset>.hpp.
    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
    for (server_assets) |asset| {
        const input_path = b.fmt("examples/server/public/{s}", .{asset});
        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});

        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:

        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
        defer b.allocator.free(input);

        // Render every input byte as a "0xNN, " array element.
        var buf = std.ArrayList(u8).init(b.allocator);
        defer buf.deinit();

        for (input) |byte| {
            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
        }

        // Derive the C identifier from the asset name: '-' and '.' become '_'.
        const name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
        defer b.allocator.free(name);
        std.mem.replaceScalar(u8, name, '.', '_');

        try std.fs.cwd().writeFile(output_path, b.fmt(
            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
            .{ name, buf.items, name, input.len },
        ));

        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
    }
}
|