convert-llama-h5-to-gguf.py : load model in parts to save memory

This commit is contained in:
klosax 2023-08-13 12:20:02 +02:00 committed by GitHub
parent e3d1f07eb1
commit 17800cd80f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,4 +1,4 @@
# Quick and dirty HF llama --> gguf conversion, GQA/70b wont work # HF llama --> gguf conversion, GQA/70b not supported
import gguf import gguf
import gguf_tensor_map as tmap import gguf_tensor_map as tmap
@ -9,7 +9,7 @@ import json
import numpy as np import numpy as np
from typing import Any, List from typing import Any, List
from pathlib import Path from pathlib import Path
from transformers import AutoModelForCausalLM import torch
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
@ -22,6 +22,15 @@ def permute(weights: NDArray, n_head: int) -> NDArray:
.swapaxes(1, 2) .swapaxes(1, 2)
.reshape(weights.shape)) .reshape(weights.shape))
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
if len(sys.argv) < 3: if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
@ -60,8 +69,8 @@ if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0] ) print("Model architecture not supported: " + hparams["architectures"][0] )
sys.exit() sys.exit()
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) # get number of model parts
list_vars = model.state_dict() num_parts = count_model_parts(dir_model)
gguf_writer = gguf.GGUFWriter.open(fname_out) gguf_writer = gguf.GGUFWriter.open(fname_out)
@ -164,41 +173,62 @@ tensor_map = tmap.get_tensor_map(block_count)
# tensor info # tensor info
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
for name in list_vars.keys():
data = list_vars[name].squeeze().numpy()
# we don't need these if num_parts == 0:
if name.endswith(".rotary_emb.inv_freq"): part_names = ("pytorch_model.bin",)
continue else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
# permute these for part_name in part_names:
if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"): print("gguf: loading model part '"+ part_name + "'")
data = permute(data,head_count) model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
# map tensor names for name in model_part.keys():
if name.endswith(".weight") and name[:-7] in tensor_map: data = model_part[name]
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape) # we don't need these
data_dtype = data.dtype if name.endswith(".rotary_emb.inv_freq"):
continue
# print( name + " dims " + str(n_dims) + " dtype " + str(data.dtype) )
if data.dtype != np.float16 and data.dtype != np.float32:
# convert any unsupported data types to float32 # convert any unsupported data types to float32
data_dtype = np.float32 if data.dtype != torch.float16 and data.dtype != torch.float32:
elif ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.to(torch.float32)
data = data.squeeze().numpy()
# permute these
if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
data = permute(data,head_count)
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
data_dtype = np.float32
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data_dtype = np.float32
# if f16 desired, convert any float32 2-dim weight tensors to float16 # if f16 desired, convert any float32 2-dim weight tensors to float16
data_dtype = np.float16 if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data_dtype = np.float16
data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4 data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4
gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes) gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes)
print("gguf: write header") print("gguf: write header")
@ -211,28 +241,63 @@ gguf_writer.write_ti_data_to_file()
# tensor data # tensor data
print("gguf: convert and write tensor data") print("gguf: convert and write tensor data")
for name in list_vars.keys(): if num_parts == 0:
data = list_vars[name].squeeze().numpy() part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
# we don't need these for part_name in part_names:
if name.endswith(".rotary_emb.inv_freq"): print("gguf: loading model part '"+ part_name + "'")
continue model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
# permute these for name in model_part.keys():
if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"): data = model_part[name]
data = permute(data, head_count)
n_dims = len(data.shape) old_dtype = data.dtype
data_dtype = data.dtype
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
if data_dtype != np.float16 and data_dtype != np.float32:
# convert any unsupported data types to float32 # convert any unsupported data types to float32
data = data.astype(np.float32) if data.dtype != torch.float16 and data.dtype != torch.float32:
elif ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.to(torch.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
data = data.astype(np.float16)
gguf_writer.write_tensor_to_file(data) data = data.squeeze().numpy()
# permute these
if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
data = permute(data, head_count)
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.write_tensor_to_file(data)
gguf_writer.close() gguf_writer.close()