llama.swiftui : add bench functionality (#4483)
* llama.swiftui : add bench button
* llama.swiftui : initial bench functionality
* force to use n_gpu_layers on simulator
* add download buttons & expose llamaState.loadModel
* update project.pbxproj
* comment #Preview & fix editorconfig check
* gitignore : xcode stuff
* llama.swiftui : UX improvements
* llama.swiftui : avoid data copy via "downloadTask"
* llama.swiftui : remove model from project
* llama : remove "mostly" from model infos
* llama.swiftui : improve bench

Co-authored-by: jhen <developer@jhen.me>
commit 800a489e4a (parent f7f468a97d)
.editorconfig
@@ -23,3 +23,6 @@ insert_final_newline = unset
 
 [examples/server/public/*]
 indent_size = 2
+
+[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+indent_style = tab
examples/llama.swiftui/.gitignore (vendored, 1 addition)
@@ -1 +1,2 @@
 xcuserdata
+xcshareddata
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -6,16 +6,34 @@ enum LlamaError: Error {
     case couldNotInitializeContext
 }
 
+func llama_batch_clear(_ batch: inout llama_batch) {
+    batch.n_tokens = 0
+}
+
+func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
+    batch.token   [Int(batch.n_tokens)] = id
+    batch.pos     [Int(batch.n_tokens)] = pos
+    batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
+    for i in 0..<seq_ids.count {
+        batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
+    }
+    batch.logits  [Int(batch.n_tokens)] = logits ? 1 : 0
+
+    batch.n_tokens += 1
+}
+
 actor LlamaContext {
     private var model: OpaquePointer
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
 
     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]
 
-    var n_len: Int32 = 512
+    var n_len: Int32 = 64
     var n_cur: Int32 = 0
 
     var n_decode: Int32 = 0
 
     init(model: OpaquePointer, context: OpaquePointer) {
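Note: these two helpers replace the manual per-field batch writes removed later in this diff. A minimal usage sketch, with hypothetical token values, assuming a batch already allocated via llama_batch_init:

    // build a batch for a hypothetical three-token prompt; token values are illustrative only
    let prompt_tokens: [llama_token] = [1, 15043, 3186]

    llama_batch_clear(&batch)
    for (i, t) in prompt_tokens.enumerated() {
        // only the last position requests logits, so the next token can be sampled
        llama_batch_add(&batch, t, Int32(i), [0], i == prompt_tokens.count - 1)
    }
    // batch.n_tokens is now Int32(prompt_tokens.count)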
@@ -27,25 +45,34 @@ actor LlamaContext {
     }
 
     deinit {
+        llama_batch_free(batch)
         llama_free(context)
         llama_free_model(model)
         llama_backend_free()
     }
 
-    static func createContext(path: String) throws -> LlamaContext {
+    static func create_context(path: String) throws -> LlamaContext {
         llama_backend_init(false)
-        let model_params = llama_model_default_params()
+        var model_params = llama_model_default_params()
+
+#if targetEnvironment(simulator)
+        model_params.n_gpu_layers = 0
+        print("Running on simulator, force use n_gpu_layers = 0")
+#endif
         let model = llama_load_model_from_file(path, model_params)
         guard let model else {
             print("Could not load model at \(path)")
             throw LlamaError.couldNotInitializeContext
         }
+
+        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
+        print("Using \(n_threads) threads")
+
         var ctx_params = llama_context_default_params()
         ctx_params.seed  = 1234
         ctx_params.n_ctx = 2048
-        ctx_params.n_threads       = 8
-        ctx_params.n_threads_batch = 8
+        ctx_params.n_threads       = UInt32(n_threads)
+        ctx_params.n_threads_batch = UInt32(n_threads)
 
         let context = llama_new_context_with_model(model, ctx_params)
         guard let context else {
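The thread heuristic above reserves two cores for the UI and caps inference at eight threads. Worked values for hypothetical core counts:

    // n_threads = max(1, min(8, processorCount - 2))
    // processorCount = 10  ->  min(8, 8) = 8 threads (cap)
    // processorCount =  6  ->  min(8, 4) = 4 threads
    // processorCount =  2  ->  max(1, min(8, 0)) = 1 thread (floor)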
@@ -56,6 +83,26 @@ actor LlamaContext {
         return LlamaContext(model: model, context: context)
     }
 
+    func model_info() -> String {
+        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
+        result.initialize(repeating: Int8(0), count: 256)
+        defer {
+            result.deallocate()
+        }
+
+        // TODO: this is probably very stupid way to get the string from C
+
+        let nChars = llama_model_desc(model, result, 256)
+        let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
+
+        var SwiftString = ""
+        for char in bufferPointer {
+            SwiftString.append(Character(UnicodeScalar(UInt8(char))))
+        }
+
+        return SwiftString
+    }
+
     func get_n_tokens() -> Int32 {
         return batch.n_tokens;
     }
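On the TODO above: llama_model_desc writes a NUL-terminated C string into the buffer, so a shorter equivalent (a sketch, not part of this commit) would be:

    func model_info() -> String {
        var buf = [CChar](repeating: 0, count: 256)
        llama_model_desc(model, &buf, buf.count)
        return String(cString: buf) // reads up to the NUL terminator
    }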
@@ -79,16 +126,11 @@ actor LlamaContext {
             print(String(cString: token_to_piece(token: id) + [0]))
         }
 
-        // batch = llama_batch_init(512, 0) // done in init()
-        batch.n_tokens = Int32(tokens_list.count)
+        llama_batch_clear(&batch)
 
-        for i1 in 0..<batch.n_tokens {
+        for i1 in 0..<tokens_list.count {
             let i = Int(i1)
-            batch.token[i] = tokens_list[i]
-            batch.pos[i] = i1
-            batch.n_seq_id[Int(i)] = 1
-            batch.seq_id[Int(i)]![0] = 0
-            batch.logits[i] = 0
+            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
         }
         batch.logits[Int(batch.n_tokens) - 1] = 1 // true
 
@@ -141,17 +183,10 @@ actor LlamaContext {
         print(new_token_str)
         // tokens_list.append(new_token_id)
 
-        batch.n_tokens = 0
+        llama_batch_clear(&batch)
+        llama_batch_add(&batch, new_token_id, n_cur, [0], true)
 
-        batch.token[Int(batch.n_tokens)] = new_token_id
-        batch.pos[Int(batch.n_tokens)] = n_cur
-        batch.n_seq_id[Int(batch.n_tokens)] = 1
-        batch.seq_id[Int(batch.n_tokens)]![0] = 0
-        batch.logits[Int(batch.n_tokens)] = 1 // true
-        batch.n_tokens += 1
-
         n_decode += 1
         n_cur    += 1
 
         if llama_decode(context, batch) != 0 {
@@ -161,14 +196,111 @@ actor LlamaContext {
         return new_token_str
     }
 
+    func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
+        var pp_avg: Double = 0
+        var tg_avg: Double = 0
+
+        var pp_std: Double = 0
+        var tg_std: Double = 0
+
+        for r in 0..<nr {
+            // bench prompt processing
+
+            llama_batch_clear(&batch)
+
+            let n_tokens = pp
+
+            for i in 0..<n_tokens {
+                llama_batch_add(&batch, 0, Int32(i), [0], false)
+            }
+            batch.logits[Int(batch.n_tokens) - 1] = 1 // true
+
+            llama_kv_cache_clear(context)
+
+            let t_pp_start = ggml_time_us()
+
+            if llama_decode(context, batch) != 0 {
+                print("llama_decode() failed during prompt")
+            }
+
+            let t_pp_end = ggml_time_us()
+
+            // bench text generation
+
+            llama_kv_cache_clear(context)
+
+            let t_tg_start = ggml_time_us()
+
+            for i in 0..<tg {
+                llama_batch_clear(&batch)
+
+                for j in 0..<pl {
+                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
+                }
+
+                if llama_decode(context, batch) != 0 {
+                    print("llama_decode() failed during text generation")
+                }
+            }
+
+            let t_tg_end = ggml_time_us()
+
+            llama_kv_cache_clear(context)
+
+            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
+            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
+
+            let speed_pp = Double(pp)    / t_pp
+            let speed_tg = Double(pl*tg) / t_tg
+
+            pp_avg += speed_pp
+            tg_avg += speed_tg
+
+            pp_std += speed_pp * speed_pp
+            tg_std += speed_tg * speed_tg
+
+            print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
+        }
+
+        pp_avg /= Double(nr)
+        tg_avg /= Double(nr)
+
+        if nr > 1 {
+            pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
+            tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
+        } else {
+            pp_std = 0
+            tg_std = 0
+        }
+
+        let model_desc     = model_info();
+        let model_size     = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
+        let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
+        let backend        = "Metal";
+        let pp_avg_str     = String(format: "%.2f", pp_avg);
+        let tg_avg_str     = String(format: "%.2f", tg_avg);
+        let pp_std_str     = String(format: "%.2f", pp_std);
+        let tg_std_str     = String(format: "%.2f", tg_std);
+
+        var result = ""
+
+        result += String("| model | size | params | backend | test | t/s |\n")
+        result += String("| --- | --- | --- | --- | --- | --- |\n")
+        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
+        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
+
+        return result;
+    }
+
     func clear() {
         tokens_list.removeAll()
         temporary_invalid_cchars.removeAll()
+        llama_kv_cache_clear(context)
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
         let utf8Count = text.utf8.count
-        let n_tokens = utf8Count + (add_bos ? 1 : 0)
+        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
         let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
         let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
 
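A note on the bench statistics above: pp_std and tg_std first accumulate sums of squared speeds, so after the averages are divided by nr, the final expressions compute the unbiased sample standard deviation

    s = sqrt( (sum_i x_i^2 - n * xbar^2) / (n - 1) )

with n = nr and xbar = pp_avg (resp. tg_avg); for nr = 1 the estimator is undefined, hence the explicit zero branch.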
examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -7,14 +7,15 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
-		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
+		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
-		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
+		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
 		542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
 		542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
-		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
+		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
 		542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
 		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
 		549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
+		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
 		8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
 		8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
 		8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
@@ -40,6 +41,7 @@
 		549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
 		549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
 		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
+		7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
 		8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
 		8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
@@ -47,7 +49,6 @@
 		8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
 		8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
 		8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
-		8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
 		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
 		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
 		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
@@ -147,7 +148,6 @@
 		8A3F84112AC4BD8C005E2EE8 /* models */ = {
 			isa = PBXGroup;
 			children = (
-				8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
 			);
 			path = models;
 			sourceTree = "<group>";
@@ -164,6 +164,7 @@
 		8A9F7C4A2AC332BF008AE1EA /* UI */ = {
 			isa = PBXGroup;
 			children = (
+				7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
 				8A1C83782AC328BD0096AF73 /* ContentView.swift */,
 			);
 			path = UI;
@@ -262,6 +263,7 @@
 				8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
 				8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
 				8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
+				7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
 				542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
 				5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
 			);
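The settings = {COMPILER_FLAGS = "-O3"; } fragments attach per-file compiler flags, so the ggml C sources are compiled with optimizations even in Xcode Debug configurations; this reading is an inference from the flags themselves, but it matters for the benchmark, since unoptimized ggml kernels would badly skew the pp/tg numbers.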
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@@ -3,24 +3,26 @@ import Foundation
 @MainActor
 class LlamaState: ObservableObject {
     @Published var messageLog = ""
+    @Published var cacheCleared = false
 
     private var llamaContext: LlamaContext?
-    private var modelUrl: URL? {
-        Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
+    private var defaultModelUrl: URL? {
+        Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
         // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
     }
 
     init() {
         do {
-            try loadModel()
+            try loadModel(modelUrl: defaultModelUrl)
         } catch {
             messageLog += "Error!\n"
         }
     }
 
-    private func loadModel() throws {
+    func loadModel(modelUrl: URL?) throws {
         messageLog += "Loading model...\n"
         if let modelUrl {
-            llamaContext = try LlamaContext.createContext(path: modelUrl.path())
+            llamaContext = try LlamaContext.create_context(path: modelUrl.path())
             messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
         } else {
             messageLog += "Could not locate model\n"
@@ -31,7 +33,7 @@ class LlamaState: ObservableObject {
         guard let llamaContext else {
             return
         }
-        messageLog += "Attempting to complete text...\n"
+
         await llamaContext.completion_init(text: text)
         messageLog += "\(text)"
 
@@ -42,4 +44,42 @@ class LlamaState: ObservableObject {
         await llamaContext.clear()
         messageLog += "\n\ndone\n"
     }
+
+    func bench() async {
+        guard let llamaContext else {
+            return
+        }
+
+        messageLog += "\n"
+        messageLog += "Running benchmark...\n"
+        messageLog += "Model info: "
+        messageLog += await llamaContext.model_info() + "\n"
+
+        let t_start = DispatchTime.now().uptimeNanoseconds
+        await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
+        let t_end = DispatchTime.now().uptimeNanoseconds
+
+        let t_heat = Double(t_end - t_start) / 1_000_000_000.0
+        messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
+
+        // if more than 5 seconds, then we're probably running on a slow device
+        if t_heat > 5.0 {
+            messageLog += "Heat up time is too long, aborting benchmark\n"
+            return
+        }
+
+        let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
+
+        messageLog += "\(result)"
+        messageLog += "\n"
+    }
+
+    func clear() async {
+        guard let llamaContext else {
+            return
+        }
+
+        await llamaContext.clear()
+        messageLog = ""
+    }
 }
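Minor note on the heat-up call above: bench returns a String that is deliberately discarded for the warm-up run, so the compiler's unused-result warning can be silenced with a one-character sketch (not part of the commit):

    _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up; result discarded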
examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -5,24 +5,97 @@ struct ContentView: View {
 
     @State private var multiLineText = ""
 
+    private static func cleanupModelCaches() {
+        // Delete all models (*.gguf)
+        let fileManager = FileManager.default
+        let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
+        do {
+            let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
+            for fileURL in fileURLs {
+                if fileURL.pathExtension == "gguf" {
+                    try fileManager.removeItem(at: fileURL)
+                }
+            }
+        } catch {
+            print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
+        }
+    }
+
     var body: some View {
         VStack {
-            ScrollView(.vertical) {
+            ScrollView(.vertical, showsIndicators: true) {
                 Text(llamaState.messageLog)
+                    .font(.system(size: 12))
+                    .frame(maxWidth: .infinity, alignment: .leading)
+                    .padding()
+                    .onTapGesture {
+                        UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
+                    }
             }
 
             TextEditor(text: $multiLineText)
-                .frame(height: 200)
+                .frame(height: 80)
                 .padding()
                 .border(Color.gray, width: 0.5)
-            Button(action: {
+            HStack {
+                Button("Send") {
                     sendText()
-            }) {
-                Text("Send")
-                .padding()
+                }
+                .padding(8)
                 .background(Color.blue)
                 .foregroundColor(.white)
                 .cornerRadius(8)
+
+                Button("Bench") {
+                    bench()
+                }
+                .padding(8)
+                .background(Color.blue)
+                .foregroundColor(.white)
+                .cornerRadius(8)
+
+                Button("Clear") {
+                    clear()
+                }
+                .padding(8)
+                .background(Color.blue)
+                .foregroundColor(.white)
+                .cornerRadius(8)
+
+                Button("Copy") {
+                    UIPasteboard.general.string = llamaState.messageLog
+                }
+                .padding(8)
+                .background(Color.blue)
+                .foregroundColor(.white)
+                .cornerRadius(8)
+            }
+
+            VStack {
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "TinyLlama-1.1B (Q4_0)",
+                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
+                    filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
+                )
+                .font(.system(size: 12))
+                .padding(.top, 4)
+
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "TinyLlama-1.1B (Q8_0)",
+                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
+                    filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
+                )
+                .font(.system(size: 12))
+
+                Button("Clear downloaded models") {
+                    ContentView.cleanupModelCaches()
+                    llamaState.cacheCleared = true
+                }
+                .padding(8)
+                .font(.system(size: 12))
+            }
         }
         .padding()
@@ -34,9 +107,20 @@
             multiLineText = ""
         }
     }
+
+    func bench() {
+        Task {
+            await llamaState.bench()
+        }
+    }
+
+    func clear() {
+        Task {
+            await llamaState.clear()
+        }
+    }
 }
-/*
-#Preview {
-    ContentView()
-}
-*/
+
+//#Preview {
+//    ContentView()
+//}
examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift (new file, 122 additions)
@@ -0,0 +1,122 @@
+import SwiftUI
+
+struct DownloadButton: View {
+    @ObservedObject private var llamaState: LlamaState
+    private var modelName: String
+    private var modelUrl: String
+    private var filename: String
+
+    @State private var status: String
+
+    @State private var downloadTask: URLSessionDownloadTask?
+    @State private var progress = 0.0
+    @State private var observation: NSKeyValueObservation?
+
+    private static func getFileURL(filename: String) -> URL {
+        FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
+    }
+
+    private func checkFileExistenceAndUpdateStatus() {
+    }
+
+    init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
+        self.llamaState = llamaState
+        self.modelName = modelName
+        self.modelUrl = modelUrl
+        self.filename = filename
+
+        let fileURL = DownloadButton.getFileURL(filename: filename)
+        status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
+    }
+
+    private func download() {
+        status = "downloading"
+        print("Downloading model \(modelName) from \(modelUrl)")
+        guard let url = URL(string: modelUrl) else { return }
+        let fileURL = DownloadButton.getFileURL(filename: filename)
+
+        downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
+            if let error = error {
+                print("Error: \(error.localizedDescription)")
+                return
+            }
+
+            guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
+                print("Server error!")
+                return
+            }
+
+            do {
+                if let temporaryURL = temporaryURL {
+                    try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
+                    print("Writing to \(filename) completed")
+
+                    llamaState.cacheCleared = false
+
+                    status = "downloaded"
+                }
+            } catch let err {
+                print("Error: \(err.localizedDescription)")
+            }
+        }
+
+        observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
+            self.progress = progress.fractionCompleted
+        }
+
+        downloadTask?.resume()
+    }
+
+    var body: some View {
+        VStack {
+            if status == "download" {
+                Button(action: download) {
+                    Text("Download " + modelName)
+                }
+            } else if status == "downloading" {
+                Button(action: {
+                    downloadTask?.cancel()
+                    status = "download"
+                }) {
+                    Text("\(modelName) (Downloading \(Int(progress * 100))%)")
+                }
+            } else if status == "downloaded" {
+                Button(action: {
+                    let fileURL = DownloadButton.getFileURL(filename: filename)
+                    if !FileManager.default.fileExists(atPath: fileURL.path) {
+                        download()
+                        return
+                    }
+                    do {
+                        try llamaState.loadModel(modelUrl: fileURL)
+                    } catch let err {
+                        print("Error: \(err.localizedDescription)")
+                    }
+                }) {
+                    Text("\(modelName) (Downloaded)")
+                }
+            } else {
+                Text("Unknown status")
+            }
+        }
+        .onDisappear() {
+            downloadTask?.cancel()
+        }
+        .onChange(of: llamaState.cacheCleared) { newValue in
+            if newValue {
+                downloadTask?.cancel()
+                let fileURL = DownloadButton.getFileURL(filename: filename)
+                status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
+            }
+        }
+    }
+}
+
+// #Preview {
+//     DownloadButton(
+//         llamaState: LlamaState(),
+//         modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
+//         modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
+//         filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
+//     )
+// }
llama.cpp (33 lines changed)
@@ -2397,25 +2397,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
+        case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-                                      return "mostly Q4_1, some F16";
+                                      return "Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
 
         default: return "unknown, may not work";
     }
@@ -2533,6 +2533,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
             switch (hparams.n_layer) {
+                case 22: model.type = e_model::MODEL_1B; break;
                 case 26: model.type = e_model::MODEL_3B; break;
                 case 32: model.type = e_model::MODEL_7B; break;
                 case 40: model.type = e_model::MODEL_13B; break;
|
Loading…
Reference in New Issue
Block a user