server : Add option to return token pieces in /tokenize endpoint (#9108)

* server : added with_pieces functionality to /tokenize endpoint * server : Add tokenize with pieces tests to server.feature * Handle case if tokenizer splits along utf8 continuation bytes * Add example of token splitting * Remove trailing ws * Fix trailing ws * Maybe fix ci * maybe this fix windows ci? --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-12-27 06:39:25 +01:00 · 2024-09-12 22:30:11 +02:00 · 2024-09-12 22:30:11 +02:00 · 78203641fe
commit 78203641fe
parent e6b7801bd1
6 changed files with 139 additions and 6 deletions
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -173,6 +173,7 @@ jobs:
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
      - name: Slow tests
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
    *Options:*
-    `content`: Set the text to tokenize.
+    `content`: (Required) The text to tokenize.
-    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
+    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
 **Response:**
 Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The `piece` field is a string if the piece is valid UTF-8, or a list of byte values otherwise.
 If `with_pieces` is `false`:
 ```json
 {
  "tokens": [123, 456, 789]
 }
 ```
 If `with_pieces` is `true`:
 ```json
 {
  "tokens": [
    {"id": 123, "piece": "Hello"},
    {"id": 456, "piece": " world"},
    {"id": 789, "piece": "!"}
  ]
 }
 ```
 With input 'á' (UTF-8 hex: C3 A1) on tinyllama/stories260k, where the tokenizer splits the character into two single-byte tokens:
 ```json
 {
  "tokens": [
    {"id": 198, "piece": [195]}, // hex C3
    {"id": 164, "piece": [161]} // hex A1
  ]
 }
 ```
 ### POST `/detokenize`: Convert tokens to text
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) {
    const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
        const json body = json::parse(req.body);
-        std::vector<llama_token> tokens;
+        json tokens_response = json::array();
        if (body.count("content") != 0) {
            const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            const bool with_pieces = json_value(body, "with_pieces", false);
            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
            if (with_pieces) {
                for (const auto& token : tokens) {
                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
                    json piece_json;
                    // Check if the piece is valid UTF-8
                    if (is_valid_utf8(piece)) {
                        piece_json = piece;
                    } else {
                        // If not valid UTF-8, store as array of byte values
                        piece_json = json::array();
                        for (unsigned char c : piece) {
                            piece_json.push_back(static_cast<int>(c));
                        }
                    }
                    tokens_response.push_back({
                        {"id", token},
                        {"piece", piece_json}
                    });
                }
            } else {
                tokens_response = tokens;
            }
        }
-        const json data = format_tokenizer_response(tokens);
+
        const json data = format_tokenizer_response(tokens_response);
        res_ok(res, data);
    };
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@ -105,6 +105,14 @@ Feature: llama.cpp server
    Given first token is removed
    Then  tokens can be detokenized
  Scenario: Tokenize with pieces
    When  tokenizing with pieces:
    """
    What is the capital of Germany?
    媽
    """
    Then  tokens are given with pieces
  Scenario: Models available
    Given available models
    Then  1 models are supported
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@ -1,3 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import asyncio
 import json
 import os
@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
    context.tokenize_add_special = True
@step("tokenizing with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
    """POST text to the server's /tokenize endpoint with ``with_pieces`` enabled.

    The text comes from ``context_text(context)`` and is remembered in
    ``context.tokenized_text``. The ``tokens`` field of the JSON reply is
    stored in ``context.tokens_with_pieces`` for later steps to inspect.
    """
    text = context_text(context)
    context.tokenized_text = text

    async with aiohttp.ClientSession() as session:
        payload = {"content": text, "with_pieces": True}

        # Only forward add_special when an earlier step set it on the context.
        add_special = getattr(context, "tokenize_add_special", None)
        if add_special is not None:
            payload["add_special"] = add_special

        endpoint = f"{context.base_url}/tokenize"
        async with session.post(endpoint, json=payload) as response:
            assert response.status == 200
            reply = await response.json()
            context.tokens_with_pieces = reply["tokens"]
@step("tokens are given with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
    """Check that every entry in ``context.tokens_with_pieces`` has an id and a piece."""
    # NOTE(review): this function shares its Python name with the
    # "tokenizing with pieces" step; consider a distinct name to avoid
    # the redefinition.
    required_keys = ("id", "piece")
    assert all(
        all(key in entry for key in required_keys)
        for entry in context.tokens_with_pieces
    )
@step('tokenizing')
@async_run_until_complete
 async def step_tokenize(context):
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
    return res;
 }
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
// Returns true when `str` consists solely of well-formed UTF-8 sequences.
//
// The check follows the Unicode well-formedness rules rather than only the
// lead/continuation bit patterns, so it also rejects:
//   - overlong encodings (lead bytes C0/C1, E0 followed by 80..9F, F0 followed by 80..8F)
//   - UTF-16 surrogate code points U+D800..U+DFFF (ED followed by A0..BF)
//   - code points above U+10FFFF (F4 followed by 90..BF, lead bytes F5..FF)
//   - truncated sequences and stray continuation bytes
//
// An empty string is valid. Embedded NUL bytes are treated as ordinary
// 1-byte sequences because the length comes from `str.length()`.
static bool is_valid_utf8(const std::string & str) {
    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(str.data());
    const unsigned char * end   = bytes + str.length();

    // continuation bytes have the form 10xxxxxx
    const auto is_cont = [](unsigned char b) { return (b & 0xC0) == 0x80; };

    while (bytes < end) {
        const unsigned char lead = *bytes;

        if (lead <= 0x7F) {
            // 1-byte sequence (0xxxxxxx)
            bytes += 1;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // 2-byte sequence (110xxxxx 10xxxxxx); C0 and C1 would be overlong
            if (end - bytes < 2 || !is_cont(bytes[1])) {
                return false;
            }
            bytes += 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 3 || !is_cont(bytes[1]) || !is_cont(bytes[2])) {
                return false;
            }
            // E0 80..9F encodes a value below U+0800 (overlong)
            if (lead == 0xE0 && bytes[1] < 0xA0) {
                return false;
            }
            // ED A0..BF encodes a surrogate U+D800..U+DFFF
            if (lead == 0xED && bytes[1] > 0x9F) {
                return false;
            }
            bytes += 3;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 4 || !is_cont(bytes[1]) || !is_cont(bytes[2]) || !is_cont(bytes[3])) {
                return false;
            }
            // F0 80..8F encodes a value below U+10000 (overlong)
            if (lead == 0xF0 && bytes[1] < 0x90) {
                return false;
            }
            // F4 90..BF encodes a value above U+10FFFF
            if (lead == 0xF4 && bytes[1] > 0x8F) {
                return false;
            }
            bytes += 4;
        } else {
            // stray continuation byte (80..BF) or invalid lead byte (C0, C1, F5..FF)
            return false;
        }
    }

    return true;
}
 static json format_tokenizer_response(const json & tokens) {
    return json {
        {"tokens", tokens}
    };