Added UnicodeDecodeError workaround for modules/llamacpp_model.py (#6040)

---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
This commit is contained in:
nclok1405 2025-01-09 09:17:31 +09:00 committed by GitHub
parent 91a8a87887
commit b9e2ded6d4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -122,7 +122,14 @@ class LlamaCppModel:
         return self.model.tokenize(string)
 
     def decode(self, ids, **kwargs):
-        return self.model.detokenize(ids).decode('utf-8')
+        detokenized = self.model.detokenize(ids)
+        try:
+            # Attempt strict UTF-8 decoding first
+            return detokenized.decode('utf-8', 'strict')
+        except UnicodeDecodeError as e:
+            # Log the error and fall back to UTF-8 with replacement
+            logger.warning(f"Invalid UTF-8 in detokenized output. Using replacement characters.\n{e}")
+            return detokenized.decode('utf-8', 'replace')
 
     def get_logits(self, tokens):
         self.model.reset()