From 8388aaa604ce25d7b036b475d01825e1977187fb Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Tue, 16 May 2023 15:16:00 +0300
Subject: [PATCH] cleanup and stuff

---
 examples/common.cpp    |  8 +++++--
 examples/common.h      |  6 +++---
 examples/main/main.cpp | 48 +++++++++++++++++++++++-------------------
 llama.cpp              | 23 +++++++++++---------
 4 files changed, 48 insertions(+), 37 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index eea8500c4..aaf6e27a9 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -362,12 +362,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.steering_mul = std::stof(argv[i]);
-        } else if (arg == "--steering-lyr") {
+        } else if (arg == "--steering-layer") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.steering_lyr = std::stoi(argv[i]);
+            params.steering_layer = std::stoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
@@ -454,6 +454,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     }
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
    fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  --steering-add        add positive steering prompt\n");
+    fprintf(stderr, "  --steering-sub        add negative steering prompt\n");
+    fprintf(stderr, "  --steering-mul        set steering strength (negative is reverse, default %.1f)\n", params.steering_mul);
+    fprintf(stderr, "  --steering-layer      set layer for steering (default %d)\n", params.steering_layer);
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
diff --git a/examples/common.h b/examples/common.h
index f2c836ae3..e56ad648e 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -73,10 +73,10 @@ struct gpt_params {
     bool mem_test        = false; // compute maximum memory usage
     bool verbose_prompt  = false; // print prompt tokens before generation

-    std::string steering_add = "";
-    std::string steering_sub = "";
+    std::string steering_add;
+    std::string steering_sub;
     float steering_mul = 1.0f;
-    int steering_lyr = 20;
+    int steering_layer = 15;
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 8ae64b93c..ffa779e05 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -136,28 +136,6 @@ int main(int argc, char ** argv) {
         return 0;
     }

-    if (params.steering_add.size() || params.steering_sub.size())
-    {
-        auto steering_add_tokens = ::llama_tokenize(ctx, params.steering_add, true);
-        auto steering_sub_tokens = ::llama_tokenize(ctx, params.steering_sub, true);
-
-        if (steering_add_tokens.size() != steering_sub_tokens.size()) {
-            llama_token space;
-            llama_tokenize(ctx, " ", &space, 1, 0);
-
-            while (steering_add_tokens.size() < steering_sub_tokens.size()) steering_add_tokens.push_back(space);
-            while (steering_sub_tokens.size() < steering_add_tokens.size()) steering_sub_tokens.push_back(space);
-        }
-
-        llama_set_steering_write(ctx, params.steering_lyr, params.steering_mul/2);
-        llama_eval(ctx, steering_add_tokens.data(), std::min((int)steering_add_tokens.size(), params.n_ctx), 0, params.n_threads);
-
-        llama_set_steering_write(ctx, params.steering_lyr, -params.steering_mul/2);
-        llama_eval(ctx, steering_sub_tokens.data(), std::min((int)steering_sub_tokens.size(), params.n_ctx), 0, params.n_threads);
-
-        llama_set_steering_read(ctx, params.steering_lyr, 1);
-    }
-
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
@@ -196,6 +174,32 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    if (!params.steering_add.empty() || !params.steering_sub.empty())
+    {
+        params.steering_add.insert(0, 1, ' ');
+        params.steering_sub.insert(0, 1, ' ');
+
+        auto add_tokens = ::llama_tokenize(ctx, params.steering_add, true);
+        auto sub_tokens = ::llama_tokenize(ctx, params.steering_sub, true);
+
+        //if (add_tokens.size() != sub_tokens.size()) {
+        //    while (add_tokens.size() < sub_tokens.size()) {
+        //        add_tokens.push_back(llama_token_nl());
+        //    }
+        //    while (sub_tokens.size() < add_tokens.size()) {
+        //        sub_tokens.push_back(llama_token_nl());
+        //    }
+        //}
+        //const int N = embd_inp.size();
+        llama_set_steering_write(ctx, params.steering_layer, +1.0f);
+        llama_eval(ctx, add_tokens.data(), std::min((int)add_tokens.size(), n_ctx), 0, params.n_threads);
+
+        llama_set_steering_write(ctx, params.steering_layer, -1.0f);
+        llama_eval(ctx, sub_tokens.data(), std::min((int)sub_tokens.size(), n_ctx), 0, params.n_threads);
+
+        llama_set_steering_read(ctx, params.steering_layer, params.steering_mul);
+    }
+
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
     if (session_tokens.size()) {
diff --git a/llama.cpp b/llama.cpp
index 61afe7d62..5e85e55d5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -287,6 +287,9 @@ void llama_set_steering_read(struct llama_context * ctx, int layer, float mul) {
     ctx->steering_mode = STEERING_READ;
     ctx->steering_mul = mul;
     ctx->steering_layer = layer;
+    //FILE* steeringbin = fopen("steering.bin", "wb");
+    //fwrite(ctx->steering_vector.data(), sizeof(float), ctx->steering_vector.size(), steeringbin);
+    //fclose(steeringbin);
 }

 template <typename T>
@@ -1163,8 +1166,9 @@ static bool llama_eval_internal(

     struct ggml_tensor * steer;
     if (lctx.steering_mode != STEERING_OFF) {
-        steer = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_embd);
-        memcpy(steer->data, lctx.steering_vector.data(), ggml_nbytes(steer));
+        steer = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        //steer->data = lctx.steering_vector.data() + n_past * n_embd * sizeof(float);
+        memcpy(steer->data, lctx.steering_vector.data() + n_past * n_embd * sizeof(float), ggml_nbytes(steer));
     }

     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1177,15 +1181,14 @@ static bool llama_eval_internal(
         lctx.use_buf(ctx0, 0);

         if (lctx.steering_mode != STEERING_OFF && il == lctx.steering_layer) {
-            steer->data = lctx.steering_vector.data();
-
-            struct ggml_tensor * src = ggml_scale(ctx0, inpL, ggml_new_f32(ctx0, lctx.steering_mul));
-            struct ggml_tensor * dst = ggml_view_2d(ctx0, steer, n_embd, N, n_embd * sizeof(float), n_past * n_embd * sizeof(float));
+            struct ggml_tensor * scal = ggml_new_f32(ctx0, lctx.steering_mul);
             if (lctx.steering_mode == STEERING_WRITE) {
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, ggml_add(ctx0, src, dst), dst));
-            } else {
-                inpL = src;
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0,
+                    ggml_add(ctx0, ggml_scale(ctx0, inpL, scal), steer), steer));
+                break;
             }
+
+            inpL = ggml_add(ctx0, ggml_scale(ctx0, steer, scal), inpL);
         }

         // norm
@@ -1403,7 +1406,7 @@ static bool llama_eval_internal(

     if (lctx.steering_mode == STEERING_WRITE) {
-        memcpy(lctx.steering_vector.data(), steer->data, ggml_nbytes(steer));
+        memcpy(lctx.steering_vector.data() + n_past * n_embd * sizeof(float), steer->data, ggml_nbytes(steer));
     }
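
Example invocation (a reviewer's sketch, not part of the patch: the model
path, prompt strings, and values below are illustrative only; the patch
itself defines just the --steering-add, --steering-sub, --steering-mul,
and --steering-layer flags):

    # hypothetical model path and prompts; flags as added in examples/common.cpp above
    ./main -m models/7B/ggml-model.bin \
        --steering-add " Love" \
        --steering-sub " Hate" \
        --steering-mul 5.0 \
        --steering-layer 15 \
        -p "I think dogs are"

With these flags, main.cpp evaluates each steering prompt once in
STEERING_WRITE mode (scaled by +1.0f and -1.0f, so their difference
accumulates in steering_vector), then switches to STEERING_READ mode so
that generation adds the stored difference to the hidden state at the
chosen layer, scaled by --steering-mul.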