mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 06:39:25 +01:00
*.py: Stylistic adjustments for python (#8233)
* Superfluous parens in conditionals were removed. * Unused args in function were removed. * Replaced unused `idx` var with `_` * Initializing file_format and format_version attributes * Renaming constant to capitals * Preventing redefinition of the `f` var Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
This commit is contained in:
parent
6f11a83e4e
commit
566daa5a5b
@ -737,7 +737,7 @@ class Model:
|
|||||||
added_tokens_json = json.load(f)
|
added_tokens_json = json.load(f)
|
||||||
for key in added_tokens_json:
|
for key in added_tokens_json:
|
||||||
token_id = added_tokens_json[key]
|
token_id = added_tokens_json[key]
|
||||||
if (token_id >= vocab_size):
|
if token_id >= vocab_size:
|
||||||
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -2005,7 +2005,7 @@ class Phi3MiniModel(Model):
|
|||||||
|
|
||||||
for key in added_tokens_json:
|
for key in added_tokens_json:
|
||||||
token_id = added_tokens_json[key]
|
token_id = added_tokens_json[key]
|
||||||
if (token_id >= vocab_size):
|
if token_id >= vocab_size:
|
||||||
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -2081,7 +2081,7 @@ class Phi3MiniModel(Model):
|
|||||||
|
|
||||||
# write rope scaling for long context (128k) model
|
# write rope scaling for long context (128k) model
|
||||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||||
if (rope_scaling is None):
|
if rope_scaling is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
scale = max_pos_embds / orig_max_pos_embds
|
scale = max_pos_embds / orig_max_pos_embds
|
||||||
@ -2728,7 +2728,7 @@ class JinaBertV2Model(BertModel):
|
|||||||
|
|
||||||
yield name, data
|
yield name, data
|
||||||
|
|
||||||
def set_vocab(self, *args, **kwargs):
|
def set_vocab(self):
|
||||||
tokenizer_class = 'BertTokenizer'
|
tokenizer_class = 'BertTokenizer'
|
||||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||||
tokenizer_class = json.load(f)['tokenizer_class']
|
tokenizer_class = json.load(f)['tokenizer_class']
|
||||||
@ -2876,7 +2876,7 @@ class ArcticModel(Model):
|
|||||||
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
|
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
|
||||||
for token_id, token_json in added_tokens_decoder.items():
|
for token_id, token_json in added_tokens_decoder.items():
|
||||||
token_id = int(token_id)
|
token_id = int(token_id)
|
||||||
if (token_id >= vocab_size):
|
if token_id >= vocab_size:
|
||||||
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -3125,7 +3125,7 @@ class T5Model(Model):
|
|||||||
added_tokens_json = json.load(f)
|
added_tokens_json = json.load(f)
|
||||||
for key in added_tokens_json:
|
for key in added_tokens_json:
|
||||||
token_id = added_tokens_json[key]
|
token_id = added_tokens_json[key]
|
||||||
if (token_id >= vocab_size):
|
if token_id >= vocab_size:
|
||||||
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
|
|||||||
|
|
||||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||||
# will be updated with time - contributions welcome
|
# will be updated with time - contributions welcome
|
||||||
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||||
|
|
||||||
if len(sys.argv) == 2:
|
if len(sys.argv) == 2:
|
||||||
token = sys.argv[1]
|
token = sys.argv[1]
|
||||||
@ -100,8 +100,8 @@ def download_file_with_auth(url, token, save_path):
|
|||||||
response = sess.get(url, headers=headers)
|
response = sess.get(url, headers=headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||||
with open(save_path, 'wb') as f:
|
with open(save_path, 'wb') as downloaded_file:
|
||||||
f.write(response.content)
|
downloaded_file.write(response.content)
|
||||||
logger.info(f"File {save_path} downloaded successfully")
|
logger.info(f"File {save_path} downloaded successfully")
|
||||||
|
|
||||||
|
|
||||||
@ -160,7 +160,7 @@ for model in models:
|
|||||||
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
|
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
|
||||||
continue # Skip to the next model if the tokenizer can't be loaded
|
continue # Skip to the next model if the tokenizer can't be loaded
|
||||||
|
|
||||||
chktok = tokenizer.encode(chktxt)
|
chktok = tokenizer.encode(CHK_TXT)
|
||||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||||
|
|
||||||
logger.info(f"model: {name}")
|
logger.info(f"model: {name}")
|
||||||
@ -192,7 +192,7 @@ src_func = f"""
|
|||||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||||
# use in llama.cpp to implement the same pre-tokenizer
|
# use in llama.cpp to implement the same pre-tokenizer
|
||||||
|
|
||||||
chktxt = {repr(chktxt)}
|
chktxt = {repr(CHK_TXT)}
|
||||||
|
|
||||||
chktok = tokenizer.encode(chktxt)
|
chktok = tokenizer.encode(chktxt)
|
||||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||||
@ -288,7 +288,7 @@ tests = [
|
|||||||
"333333333",
|
"333333333",
|
||||||
"Cửa Việt", # llama-bpe fails on this
|
"Cửa Việt", # llama-bpe fails on this
|
||||||
" discards",
|
" discards",
|
||||||
chktxt,
|
CHK_TXT,
|
||||||
]
|
]
|
||||||
|
|
||||||
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
|
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
|
||||||
|
@ -132,6 +132,10 @@ class Tensor:
|
|||||||
|
|
||||||
|
|
||||||
class GGMLModel:
|
class GGMLModel:
|
||||||
|
|
||||||
|
file_format: GGMLFormat
|
||||||
|
format_version: int
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.hyperparameters = None
|
self.hyperparameters = None
|
||||||
self.vocab = None
|
self.vocab = None
|
||||||
@ -290,7 +294,7 @@ class GGMLToGGUF:
|
|||||||
if self.vocab_override is not None:
|
if self.vocab_override is not None:
|
||||||
vo = self.vocab_override
|
vo = self.vocab_override
|
||||||
logger.info('* Adding vocab item(s)')
|
logger.info('* Adding vocab item(s)')
|
||||||
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
||||||
tokens.append(vbytes)
|
tokens.append(vbytes)
|
||||||
scores.append(score)
|
scores.append(score)
|
||||||
toktypes.append(ttype)
|
toktypes.append(ttype)
|
||||||
|
Loading…
Reference in New Issue
Block a user