Disable half2 for ExLlama when using HIP (#2912)

ardfork 2023-06-29 18:03:16 +00:00 committed by GitHub
parent ac0f96e785
commit 3c076c3c80
2 changed files with 13 additions and 0 deletions
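
Background for the change below: on ROCm builds of PyTorch, torch.version.hip is a version string (for example "5.4.22803"), while on CUDA builds it is None, so a plain truthiness test is enough to detect HIP at runtime. A minimal sketch of that detection (the is_hip helper is illustrative, not part of this commit):

import torch

def is_hip() -> bool:
    # torch.version.hip is a version string on ROCm builds of PyTorch
    # and None on CUDA builds, so truthiness alone distinguishes them.
    return bool(torch.version.hip)

print("HIP/ROCm build" if is_hip() else "CUDA or CPU build")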

modules/exllama.py

@@ -1,6 +1,8 @@
 import sys
 from pathlib import Path
 
+from torch import version as torch_version
+
 from modules import shared
 from modules.logging_colors import logger
@@ -51,6 +53,12 @@ class ExllamaModel:
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
 
+        if torch_version.hip:
+            config.rmsnorm_no_half2 = True
+            config.rope_no_half2 = True
+            config.matmul_no_half2 = True
+            config.silu_no_half2 = True
+
         model = ExLlama(config)
         tokenizer = ExLlamaTokenizer(str(tokenizer_model_path))
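
For reference, the guard added above could be factored into a single helper shared by both loaders. This is only a sketch, assuming config is an exllama ExLlamaConfig exposing the four *_no_half2 attributes set in the hunk above; the disable_half2_on_hip name is hypothetical:

from torch import version as torch_version

def disable_half2_on_hip(config):
    # On HIP/ROCm builds, turn off the half2 (paired fp16) code paths
    # for RMSNorm, RoPE, matmul, and SiLU, exactly as the hunk above does.
    if torch_version.hip:
        config.rmsnorm_no_half2 = True
        config.rope_no_half2 = True
        config.matmul_no_half2 = True
        config.silu_no_half2 = True
    return config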

modules/exllama_hf.py

@@ -97,6 +97,11 @@ class ExllamaHF(PreTrainedModel):
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
+        if torch.version.hip:
+            config.rmsnorm_no_half2 = True
+            config.rope_no_half2 = True
+            config.matmul_no_half2 = True
+            config.silu_no_half2 = True
         # This slows things down a bit but aligns better with autogptq generation.
         # TODO: should give the user a choice to tune the exllama config
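
One way to act on the TODO above would be to surface these knobs as command-line options. The sketch below is hypothetical: the --exllama-no-half2 flag and its wiring are not part of this commit or the project's actual CLI.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--exllama-no-half2",
    action="store_true",
    help="Force-disable half2 kernels even on CUDA builds.",
)
args = parser.parse_args()

# When building the ExLlama config, the user's choice would be combined
# with the automatic HIP detection:
# if args.exllama_no_half2 or torch.version.hip:
#     config.rmsnorm_no_half2 = True
#     ...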