convert : skip inaccessible HF repos (#7210)

2024-12-25 13:58:46 +01:00 · 2024-05-11 10:18:35 +02:00 · 2024-05-11 10:18:35 +02:00 · 3292733f95
commit 3292733f95
parent 988631335a
1 changed files with 20 additions and 2 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@ -145,8 +145,17 @@ for model in models:
    if tokt == TOKENIZER_TYPE.SPM:
        continue
    # Skip if the tokenizer folder does not exist, e.g. because a previous download failed
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue
    # create the tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded
    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -287,8 +296,17 @@ for model in models:
    name = model["name"]
    tokt = model["tokt"]
    # Skip if the tokenizer folder does not exist, e.g. because a previous download failed
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue
    # create the tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop
    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests: