convert : fix Baichuan2 models by using vocab size in config.json (#3299)

Use local GGUF package when possible in Baichuan converter
This commit is contained in:
Kerfuffle 2023-10-04 08:20:28 -06:00 committed by GitHub
parent beabc8cfb0
commit 019ba1dcd0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 import itertools
-import gguf
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor # type: ignore[import]
 
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
 
 if TYPE_CHECKING:
     from typing import TypeAlias
@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
 print("gguf: get sentencepiece tokenizer vocab, scores and token types")
 
 tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+    vocab_size = tokenizer.vocab_size()
 
-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
     text: bytes
     score: float