mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-25 01:09:22 +01:00
Small fixes
This commit is contained in:
parent
2a1063eff5
commit
8a6d9abb41
@ -47,6 +47,10 @@ Examples:
|
|||||||
* **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed.
|
* **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed.
|
||||||
* **cache_8bit**: Create an 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much).
|
* **cache_8bit**: Create an 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much).
|
||||||
|
|
||||||
|
### ExLlamav2
|
||||||
|
|
||||||
|
The same as ExLlamav2_HF but using the internal samplers of ExLlamav2 instead of the ones in the Transformers library.
|
||||||
|
|
||||||
### AutoGPTQ
|
### AutoGPTQ
|
||||||
|
|
||||||
Loads: GPTQ models.
|
Loads: GPTQ models.
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
| llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF |
|
| llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF |
|
||||||
| llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ |
|
| llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ |
|
||||||
| ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ |
|
| ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||||
|
| ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF |
|
||||||
| AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
| AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||||
| AutoAWQ | ? | ❌ | ? | ? | ✅ |
|
| AutoAWQ | ? | ❌ | ? | ? | ✅ |
|
||||||
| GPTQ-for-LLaMa | ✅\*\* | ✅\*\*\* | ✅ | ✅ | ✅ |
|
| GPTQ-for-LLaMa | ✅\*\* | ✅\*\*\* | ✅ | ✅ | ✅ |
|
||||||
|
@ -50,7 +50,6 @@ settings = {
|
|||||||
'prompt_lookup_num_tokens': 0,
|
'prompt_lookup_num_tokens': 0,
|
||||||
'custom_stopping_strings': '',
|
'custom_stopping_strings': '',
|
||||||
'custom_token_bans': '',
|
'custom_token_bans': '',
|
||||||
'sampler_priority': 'temperature,top_k,top_p,typical_p,epsilon_cutoff,eta_cutoff,tfs,top_a,min_p,dynamic_temperature,quadratic_sampling,mirostat',
|
|
||||||
'auto_max_new_tokens': False,
|
'auto_max_new_tokens': False,
|
||||||
'ban_eos_token': False,
|
'ban_eos_token': False,
|
||||||
'add_bos_token': True,
|
'add_bos_token': True,
|
||||||
@ -130,7 +129,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
|
|||||||
group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
|
group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
|
||||||
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
|
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
|
||||||
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
|
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
|
||||||
group.add_argument('--row_split', action='store_true', help='Split multi-gpu by row instead of layer. Faster on some cards.')
|
group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
|
||||||
|
|
||||||
# ExLlama
|
# ExLlama
|
||||||
group = parser.add_argument_group('ExLlama')
|
group = parser.add_argument_group('ExLlama')
|
||||||
|
Loading…
Reference in New Issue
Block a user