From b9e2ded6d4ea32ceccea87d1f0a37ed003f0316e Mon Sep 17 00:00:00 2001
From: nclok1405 <155463060+nclok1405@users.noreply.github.com>
Date: Thu, 9 Jan 2025 09:17:31 +0900
Subject: [PATCH] Added UnicodeDecodeError workaround for modules/llamacpp_model.py (#6040)

---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
---
 modules/llamacpp_model.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 6a76ee4e..c79755e4 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -122,7 +122,14 @@ class LlamaCppModel:
         return self.model.tokenize(string)
 
     def decode(self, ids, **kwargs):
-        return self.model.detokenize(ids).decode('utf-8')
+        detokenized = self.model.detokenize(ids)
+        try:
+            # Attempt strict UTF-8 decoding first
+            return detokenized.decode('utf-8', 'strict')
+        except UnicodeDecodeError as e:
+            # Log the error and fall back to UTF-8 with replacement
+            logger.warning(f"Invalid UTF-8 in detokenized output. Using replacement characters.\n{e}")
+            return detokenized.decode('utf-8', 'replace')
 
     def get_logits(self, tokens):
         self.model.reset()