import datetime
import traceback
from pathlib import Path

import pandas as pd
import torch
from datasets import load_dataset
from tqdm import tqdm

from modules import shared
from modules.models import load_model, unload_model
from modules.text_generation import encode
from server import get_model_specific_settings, update_model_parameters

# Location of the persisted evaluation history.
_EVALUATIONS_PATH = Path('logs/evaluations.csv')

# Column layout of the evaluation history table.
_EVALUATION_COLUMNS = ['Model', 'LoRAs', 'Dataset', 'Perplexity', 'stride', 'max_length', 'Date', 'Comment']


def load_past_evaluations():
    '''
    Return the evaluation history as a DataFrame.

    The history is read from logs/evaluations.csv when that file exists.
    Every column is loaded as a string, and 'Perplexity' is then converted
    to a numeric dtype. When the file does not exist, an empty DataFrame
    with the expected columns is returned.
    '''
    if not _EVALUATIONS_PATH.exists():
        return pd.DataFrame(columns=_EVALUATION_COLUMNS)

    df = pd.read_csv(_EVALUATIONS_PATH, dtype=str)
    df['Perplexity'] = pd.to_numeric(df['Perplexity'])
    return df


past_evaluations = load_past_evaluations()


def save_past_evaluations(df):
    '''
    Replace the in-memory evaluation history with `df` and write it to
    logs/evaluations.csv (without the index column).
    '''
    global past_evaluations
    past_evaluations = df

    # to_csv does not create missing directories, so make sure logs/ exists.
    _EVALUATIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(_EVALUATIONS_PATH, index=False)


def _load_dataset_text(input_dataset):
    '''
    Return the evaluation corpus for `input_dataset` as a single string.

    'wikitext', 'ptb' and 'ptb_new' are fetched with `load_dataset`; any
    other value is treated as the stem of a UTF-8 text file under
    training/datasets/.
    '''
    # Copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/utils/datautils.py
    if input_dataset == 'wikitext':
        data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
        return "\n\n".join(data['text'])

    if input_dataset == 'ptb':
        data = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
        return "\n\n".join(data['sentence'])

    if input_dataset == 'ptb_new':
        data = load_dataset('ptb_text_only', 'penn_treebank', split='test')
        return " ".join(data['sentence'])

    with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:
        return f.read()


def calculate_perplexity(models, input_dataset, stride, _max_length):
    '''
    Evaluate the perplexity of each model in `models` on `input_dataset`.

    This is a generator: it yields the progress log (a string) after every
    step so that a caller can display it incrementally.

    Parameters:
        models: iterable of model names. The special name 'current model'
            skips loading and evaluates whatever is in `shared.model`.
        input_dataset: 'wikitext', 'ptb', 'ptb_new', or the stem of a text
            file under training/datasets/.
        stride: number of tokens the evaluation window advances per step.
        _max_length: size of the evaluation window in tokens. A falsy value
            falls back to `shared.model.config.max_position_embeddings`.

    Each result is appended to the evaluation history and saved to disk.

    Based on:
    https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models
    '''
    global past_evaluations

    cumulative_log = ''
    cumulative_log += "Loading the input dataset...\n"
    yield cumulative_log

    text = _load_dataset_text(input_dataset)

    for model in models:
        # NOTE(review): results are stored under shared.model_name, so the
        # literal 'current model' presumably never matches a stored row and
        # is always re-evaluated -- confirm this is intended.
        if is_in_past_evaluations(model, input_dataset, stride, _max_length):
            cumulative_log += f"{model} has already been tested. Ignoring.\n"
            yield cumulative_log
            continue

        if model != 'current model':
            # The yield is kept outside the try block: closing the generator
            # raises GeneratorExit at the yield, and that must not be handled
            # as a model loading failure.
            yield cumulative_log + f"Loading {model}...\n"
            try:
                model_settings = get_model_specific_settings(model)
                shared.settings.update(model_settings)  # hijacking the interface defaults
                update_model_parameters(model_settings)  # hijacking the command-line arguments
                shared.model_name = model
                unload_model()
                shared.model, shared.tokenizer = load_model(shared.model_name)
            except Exception:
                # Best effort: report the failure and go on to the next model,
                # but leave the traceback on the console for diagnosis.
                traceback.print_exc()
                cumulative_log += f"Failed to load {model}. Moving on.\n"
                yield cumulative_log
                continue

        cumulative_log += f"Processing {model}...\n"
        yield cumulative_log + "Tokenizing the input dataset...\n"

        encodings = encode(text, add_special_tokens=False)
        seq_len = encodings.shape[1]
        max_length = _max_length or shared.model.config.max_position_embeddings

        nlls = []
        prev_end_loc = 0
        for begin_loc in tqdm(range(0, seq_len, stride)):
            yield cumulative_log + f"Evaluating... {100*begin_loc/seq_len:.2f}%"

            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
            input_ids = encodings[:, begin_loc:end_loc]

            # Only the last trg_len tokens are scored; the preceding tokens
            # are masked with -100 and serve as context only.
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = shared.model(input_ids, labels=target_ids)

                # loss is calculated using CrossEntropyLoss which averages over valid labels
                # N.B. the model only calculates loss over trg_len - 1 labels, because it
                # internally shifts the labels to the left by 1.
                neg_log_likelihood = outputs.loss

            nlls.append(neg_log_likelihood)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        ppl = torch.exp(torch.stack(nlls).mean())
        add_entry_to_past_evaluations(float(ppl), shared.model_name, input_dataset, stride, _max_length)
        save_past_evaluations(past_evaluations)
        cumulative_log += f"Done. The perplexity is: {float(ppl)}\n\n"
        yield cumulative_log


def add_entry_to_past_evaluations(perplexity, model, dataset, stride, max_length):
    '''
    Append one result row to the in-memory evaluation history.

    `stride` and `max_length` are stored as strings, the LoRA names come
    from `shared.lora_names` ('-' when there are none), and the row is
    stamped with the current local time. Nothing is written to disk here.
    '''
    global past_evaluations
    entry = {
        'Model': model,
        'LoRAs': ', '.join(shared.lora_names) or '-',
        'Dataset': dataset,
        'Perplexity': perplexity,
        'stride': str(stride),
        'max_length': str(max_length),
        'Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'Comment': ''
    }
    past_evaluations = pd.concat([past_evaluations, pd.DataFrame([entry])], ignore_index=True)


def is_in_past_evaluations(model, dataset, stride, max_length):
    '''
    Return True when the evaluation history already has a row with the same
    model, dataset, stride and max_length.

    `stride` and `max_length` are compared as strings, which is how they
    are stored in the history.
    '''
    matches = (
        (past_evaluations['Model'] == model)
        & (past_evaluations['Dataset'] == dataset)
        & (past_evaluations['max_length'] == str(max_length))
        & (past_evaluations['stride'] == str(stride))
    )
    return bool(matches.any())


def generate_markdown_table():
    '''
    Return the evaluation history sorted by dataset, stride, perplexity and
    date.

    Despite the name, the return value is a DataFrame, not markdown text.
    '''
    sorted_df = past_evaluations.sort_values(by=['Dataset', 'stride', 'Perplexity', 'Date'])
    return sorted_df