hooks : setting up flake8 and pre-commit hooks (#1681)

Small, non-functional changes were made to non-compliant files.
These include breaking up long lines, whitespace cleanup, and
unused import removal.

The maximum line length in Python files was set to a generous 125 characters
in order to minimize the number of changes needed in scripts and general
annoyance. The "txt" prompts directory is excluded from the checks,
as it may contain oddly formatted files and strings for a good reason.

Signed-off-by: Jiri Podivin <jpodivin@gmail.com>
This commit is contained in:
Jiří Podivín 2023-06-17 12:32:48 +02:00 committed by GitHub
parent bac19927c3
commit 5ddf7ea1fb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 42 additions and 12 deletions

2
.flake8 Normal file
View File

@ -0,0 +1,2 @@
[flake8]
max-line-length = 125

15
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,15 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: prompts/.*\.txt  # Python regex; "\." matches only a literal dot before "txt"
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
    -   id: trailing-whitespace
    -   id: end-of-file-fixer
    -   id: check-yaml
    -   id: check-added-large-files
-   repo: https://github.com/PyCQA/flake8
    rev: 6.0.0
    hooks:
    -   id: flake8

View File

@ -512,7 +512,11 @@ class LazyTensor:
if not isinstance(self.data_type, QuantizedDataType): if not isinstance(self.data_type, QuantizedDataType):
raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})") raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
if self.data_type.have_g_idx: if self.data_type.have_g_idx:
sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n") sys.stderr.write(
"Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
"which is not yet natively supported by GGML. "
"For now you can still convert this model by passing `--outtype f16` to dequantize, "
"but that will result in a much larger output file for no quality benefit.\n")
sys.exit(1) sys.exit(1)
assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
@ -695,7 +699,8 @@ class LazyUnpickler(pickle.Unpickler):
return LazyStorage(load=load, kind=pid[1], description=description) return LazyStorage(load=load, kind=pid[1], description=description)
# @staticmethod # @staticmethod
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName] def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
# pyright: ignore[reportSelfClsParameterName]
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
assert isinstance(storage, LazyStorage) assert isinstance(storage, LazyStorage)
@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus:
files = list(path.glob("model-00001-of-*.safetensors")) files = list(path.glob("model-00001-of-*.safetensors"))
if not files: if not files:
# Try the PyTorch patterns too, with lower priority # Try the PyTorch patterns too, with lower priority
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ] globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
files = [file for glob in globs for file in path.glob(glob)] files = [file for glob in globs for file in path.glob(glob)]
if not files: if not files:
# Try GGML too, but with lower priority, since if both a non-GGML # Try GGML too, but with lower priority, since if both a non-GGML
@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab:
elif path3.exists(): elif path3.exists():
path = path3 path = path3
else: else:
raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir") raise FileNotFoundError(
f"Could not find tokenizer.model in {path} or its parent; "
"if it's in another directory, pass the directory as --vocab-dir")
added_tokens_path = path.parent / "added_tokens.json" added_tokens_path = path.parent / "added_tokens.json"
print(f"Loading vocab file {path}") print(f"Loading vocab file {path}")
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
}[params.file_type] }[params.file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
if ret in model_paths: if ret in model_paths:
sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n") sys.stderr.write(
f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n")
sys.exit(1) sys.exit(1)
return ret return ret
@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("model", type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
args = parser.parse_args(args_in) args = parser.parse_args(args_in)
vocab: Vocab vocab: Vocab

View File

@ -1,5 +1,5 @@
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import sys, os import os
import csv import csv
labels = [] labels = []
@ -8,6 +8,7 @@ numEntries = 1
rows = [] rows = []
def bar_chart(numbers, labels, pos): def bar_chart(numbers, labels, pos):
plt.bar(pos, numbers, color='blue') plt.bar(pos, numbers, color='blue')
plt.xticks(ticks=pos, labels=labels) plt.xticks(ticks=pos, labels=labels)
@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos):
plt.ylabel("Questions Correct") plt.ylabel("Questions Correct")
plt.show() plt.show()
def calculatecorrect(): def calculatecorrect():
directory = os.fsencode("./examples/jeopardy/results/") directory = os.fsencode("./examples/jeopardy/results/")
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',') csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
@ -38,14 +40,13 @@ def calculatecorrect():
print(line) print(line)
else: else:
print("Correct answer: " + rows[i][2] + "\n") print("Correct answer: " + rows[i][2] + "\n")
i+=1 i += 1
print("Did the AI get the question right? (y/n)") print("Did the AI get the question right? (y/n)")
if input() == "y": if input() == "y":
totalcorrect += 1 totalcorrect += 1
numbers.append(totalcorrect) numbers.append(totalcorrect)
if __name__ == '__main__': if __name__ == '__main__':
calculatecorrect() calculatecorrect()
pos = list(range(numEntries)) pos = list(range(numEntries))

View File

@ -1,6 +1,7 @@
import os import os
import hashlib import hashlib
def sha256sum(file): def sha256sum(file):
block_size = 16 * 1024 * 1024 # 16 MB block size block_size = 16 * 1024 * 1024 # 16 MB block size
b = bytearray(block_size) b = bytearray(block_size)
@ -15,6 +16,7 @@ def sha256sum(file):
return file_hash.hexdigest() return file_hash.hexdigest()
# Define the path to the llama directory (parent folder of script directory) # Define the path to the llama directory (parent folder of script directory)
llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))