Updated Metadata Override (markdown)

Brian 2024-11-18 01:07:44 +11:00
parent 584c3525b5
commit eb98a44bb0

@ -31,19 +31,55 @@ https://github.com/ggerganov/llama.cpp/pull/7499 was added which adds `--metadat
// Model Parents (Merges, Pre-tuning, etc...) // Model Parents (Merges, Pre-tuning, etc...)
"general.base_models" : [ "general.base_models" : [
{ {
"name" : "base model example" , "name": "GPT-3",
"author" : "example parent" , "author": "OpenAI",
"version" : "v3.2" , "version": "3.0",
"organization" : "grandOldMaster" , "organization": "OpenAI",
"url" : "https://huggingface.co/SparkExampleMind/parentlalama-1Q-v1.0-safetensor/blob/main/README.md", "description": "A large language model capable of performing a wide variety of language tasks.",
"doi" : "doi:10.1080/02626667.2018.1560449", "url": "https://openai.com/research/gpt-3",
"uuid" : "52d8c7ef-1de5-43f1-87a4-0c7c9c3d07c4" , "doi": "10.5555/gpt3doi123456",
"repo_url" : "https://huggingface.co/SparkExampleMind/parentlalama-1Q-v1.0-safetensor" "uuid": "123e4567-e89b-12d3-a456-426614174000",
"repo_url": "https://github.com/openai/gpt-3"
},
{
"name": "BERT",
"author": "Google AI Language",
"version": "1.0",
"organization": "Google",
"description": "A transformer-based model pretrained on English to achieve state-of-the-art performance on a range of NLP tasks.",
"url": "https://github.com/google-research/bert",
"doi": "10.5555/bertdoi789012",
"uuid": "987e6543-e21a-43f3-a356-527614173999",
"repo_url": "https://github.com/google-research/bert"
} }
], ],
// Array based metadata // Model Datasets Used (Training data...)
"general.datasets": [
{
"name": "Wikipedia Corpus",
"author": "Wikimedia Foundation",
"version": "2021-06",
"organization": "Wikimedia",
"description": "A dataset comprising the full English Wikipedia, used to train models in a range of natural language tasks.",
"url": "https://dumps.wikimedia.org/enwiki/",
"doi": "10.5555/wikidoi234567",
"uuid": "234e5678-f90a-12d3-c567-426614172345",
"repo_url": "https://github.com/wikimedia/wikipedia-corpus"
},
{
"name": "Common Crawl",
"author": "Common Crawl Foundation",
"version": "2021-04",
"organization": "Common Crawl",
"description": "A dataset containing web-crawled data from various domains, providing a broad range of text.",
"url": "https://commoncrawl.org",
"doi": "10.5555/ccdoi345678",
"uuid": "345e6789-f90b-34d5-d678-426614173456",
"repo_url": "https://github.com/commoncrawl/cc-crawl-data"
}
],
// Array Based Metadata
"general.tags": ["text generation", "transformer", "llama", "tiny", "tiny model"], "general.tags": ["text generation", "transformer", "llama", "tiny", "tiny model"],
"general.languages": ["en"], "general.languages": ["en"]
"general.datasets": ["https://huggingface.co/datasets/roneneldan/TinyStories/blob/main/TinyStoriesV2-GPT4-train.txt", "https://huggingface.co/datasets/roneneldan/TinyStories/blob/main/TinyStoriesV2-GPT4-valid.txt"]
} }
``` ```