Mirror of https://github.com/ggerganov/llama.cpp.git (last synced 2025-01-12 05:17:21 +01:00).
convert : fix Baichuan2 models by using vocab size in config.json (#3299)
Use local GGUF package when possible in Baichuan converter
This commit is contained in:
parent
beabc8cfb0
commit
019ba1dcd0
@@ -11,11 +11,14 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
import itertools
|
import itertools
|
||||||
import gguf
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||||
|
|
||||||
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||||
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from typing import TypeAlias
|
from typing import TypeAlias
|
||||||
@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
|
|||||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||||
|
|
||||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
||||||
|
vocab_size = hparams.get('vocab_size')
|
||||||
|
if vocab_size is None:
|
||||||
|
vocab_size = tokenizer.vocab_size()
|
||||||
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
for i in range(vocab_size):
|
||||||
text: bytes
|
text: bytes
|
||||||
score: float
|
score: float
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user