text-generation-webui/modules/llama_cpp_python_hijack.py

from typing import Sequence

from tqdm import tqdm

# Each llama-cpp-python build is optional: a build that fails to import is
# recorded as None instead of aborting module load.
# `except Exception` (rather than a bare `except`) still covers every ordinary
# import or shared-library loading failure, but lets KeyboardInterrupt and
# SystemExit propagate instead of being silently swallowed.
try:
    import llama_cpp
except Exception:
    llama_cpp = None

try:
    import llama_cpp_cuda
except Exception:
    llama_cpp_cuda = None

try:
    import llama_cpp_cuda_tensorcores
except Exception:
    llama_cpp_cuda_tensorcores = None


def eval_with_progress(self, tokens: Sequence[int]):
    """
    Evaluate ``tokens`` in chunks of ``self.n_batch`` while reporting progress.

    Mirrors ``Llama.eval`` from

    https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py

    with the chunk loop wrapped in a tqdm bar whenever more than one token is
    passed in. For every chunk the tokens are written to ``self.input_ids``,
    the logits returned by the context are written to ``self.scores`` and
    ``self.n_tokens`` is advanced by the chunk length.
    """
    assert self._ctx.ctx is not None
    assert self._batch.batch is not None
    self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)

    total = len(tokens)
    chunk_starts = range(0, total, self.n_batch)

    # A single-token call iterates the plain range; longer inputs get a bar.
    steps = chunk_starts
    if total > 1:
        steps = tqdm(chunk_starts, desc="Prompt evaluation", leave=False)

    for start in steps:
        stop = min(start + self.n_batch, total)
        chunk = tokens[start:stop]
        past = self.n_tokens
        chunk_len = len(chunk)

        self._batch.set_batch(
            batch=chunk,
            n_past=past,
            logits_all=self.context_params.logits_all,
        )
        self._ctx.decode(self._batch)

        # Record the tokens that now occupy these positions.
        self.input_ids[past : past + chunk_len] = chunk

        # NOTE: when logits_all is False only the last token's logits are kept.
        vocab = self._n_vocab
        if self.context_params.logits_all:
            first_row = 0
        else:
            first_row = chunk_len - 1

        logits = self._ctx.get_logits()[first_row * vocab : chunk_len * vocab]
        # Flattened view over the destination rows, filled in place.
        target = self.scores[past + first_row : past + chunk_len, :].reshape(-1)
        target[:] = logits

        # Advance the running token count.
        self.n_tokens += chunk_len


# Install the progress-reporting eval on every llama.cpp binding that imported.
for lib in (llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores):
    if lib is None:
        continue

    lib.Llama.eval = eval_with_progress
llama.cpp: add a progress bar for prompt evaluation 2024-02-08 06:40:58 +01:00			`from typing import Sequence`

			`from tqdm import tqdm`

			`try:`
			`import llama_cpp`
			`except:`
			`llama_cpp = None`

			`try:`
			`import llama_cpp_cuda`
			`except:`
			`llama_cpp_cuda = None`

			`try:`
			`import llama_cpp_cuda_tensorcores`
			`except:`
			`llama_cpp_cuda_tensorcores = None`


			`def eval_with_progress(self, tokens: Sequence[int]):`
			`"""`
			`A copy of`

			`https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py`

			`with tqdm to show prompt processing progress.`
			`"""`
			`assert self._ctx.ctx is not None`
			`assert self._batch.batch is not None`
			`self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)`

			`if len(tokens) > 1:`
			`progress_bar = tqdm(range(0, len(tokens), self.n_batch), desc="Prompt evaluation", leave=False)`
			`else:`
			`progress_bar = range(0, len(tokens), self.n_batch)`

			`for i in progress_bar:`
			`batch = tokens[i : min(len(tokens), i + self.n_batch)]`
			`n_past = self.n_tokens`
			`n_tokens = len(batch)`
			`self._batch.set_batch(`
			`batch=batch, n_past=n_past, logits_all=self.context_params.logits_all`
			`)`
			`self._ctx.decode(self._batch)`
			`# Save tokens`
			`self.input_ids[n_past : n_past + n_tokens] = batch`
			`# Save logits`
			`rows = n_tokens`
			`cols = self._n_vocab`
			`offset = (`
			`0 if self.context_params.logits_all else n_tokens - 1`
			`) # NOTE: Only save the last token logits if logits_all is False`
			`self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[`
			`:`
			`] = self._ctx.get_logits()[offset * cols : rows * cols]`
			`# Update n_tokens`
			`self.n_tokens += n_tokens`


			`for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:`
			`if lib is not None:`
			`lib.Llama.eval = eval_with_progress`