Updated Metadata Override (markdown)

2024-11-22 00:07:58 +01:00 · 2024-11-18 22:22:27 +11:00 · 2024-11-18 22:22:27 +11:00 · 1817048cfc
commit 1817048cfc
parent eb98a44bb0
1 changed files with 85 additions and 0 deletions
--- a/Metadata-Override.md
+++ b/Metadata-Override.md
@@ -82,4 +82,89 @@ https://github.com/ggerganov/llama.cpp/pull/7499 was added which adds `--metadat
    "general.tags": ["text generation", "transformer", "llama", "tiny", "tiny model"],
    "general.languages": ["en"]
 }
+```
+
+As for how this may correspond to Hugging Face-style model cards, consider:
+
+```yaml
+# Model Card Fields
+model_name: Example Model Six
+model_author: John Smith
+model_version: v1.0
+model_organization: SparkExampleMind
+quantized_by: Abbety Jenson
+model_description: This is an example of a model
+# Useful for cleanly regenerating default naming conventions
+model_finetune: instruct
+model_basename: llamabase
+model_size_label: 8x2.3Q
+# Licensing details
+license: apache-2.0
+license_name: 'Apache License Version 2.0, January 2004'
+license_link: 'https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md'
+# Typically represents the converted GGUF repo (Unless native)
+model_url: 'https://huggingface.co/SparkExampleMind/llamabase-8x2.3Q-instruct-v1.0-F16/blob/main/README.md'
+model_doi: 'doi:10.1080/02626667.2018.1560449'
+model_uuid: f18383df-ceb9-4ef3-b929-77e4dc64787c
+model_repo_url: 'https://huggingface.co/SparkExampleMind/llamabase-8x2.3Q-instruct-v1.0-F16'
+# Model Source If Conversion
+source_model_url: 'https://huggingface.co/SparkExampleMind/llamabase-8x2.3Q-instruct-v1.0-safetensor/blob/main/README.md'
+source_model_doi: 'doi:10.1080/02626667.2018.1560449'
+source_model_uuid: 'a72998bf-3b84-4ff4-91c6-7a6b780507bc'
+source_model_repo_url: 'https://huggingface.co/SparkExampleMind/llamabase-8x2.3Q-instruct-v1.0-safetensor'
+# Model Parents (Merges, Pre-tuning, etc...)
+base_model_sources:
+  - name: GPT-3
+    author: OpenAI
+    version: '3.0'
+    organization: OpenAI
+    description: >-
+      A large language model capable of performing a wide variety of language
+      tasks.
+    url: 'https://openai.com/research/gpt-3'
+    doi: 10.5555/gpt3doi123456
+    uuid: 123e4567-e89b-12d3-a456-426614174000
+    repo_url: 'https://github.com/openai/gpt-3'
+  - name: BERT
+    author: Google AI Language
+    version: '1.0'
+    organization: Google
+    description: >-
+      A transformer-based model pretrained on English to achieve
+      state-of-the-art performance on a range of NLP tasks.
+    url: 'https://github.com/google-research/bert'
+    doi: 10.5555/bertdoi789012
+    uuid: 987e6543-e21a-43f3-a356-527614173999
+    repo_url: 'https://github.com/google-research/bert'
+# Model Datasets Used (Training data...)
+dataset_sources:
+  - name: Wikipedia Corpus
+    author: Wikimedia Foundation
+    version: 2021-06
+    organization: Wikimedia
+    description: A dataset comprising the full English Wikipedia, used to train models in a range of natural language tasks.
+    url: 'https://dumps.wikimedia.org/enwiki/'
+    doi: 10.5555/wikidoi234567
+    uuid: 234e5678-f90a-12d3-c567-426614172345
+    repo_url: 'https://github.com/wikimedia/wikipedia-corpus'
+  - name: Common Crawl
+    author: Common Crawl Foundation
+    version: 2021-04
+    organization: Common Crawl
+    description: A dataset containing web-crawled data from various domains, providing a broad range of text.
+    url: 'https://commoncrawl.org'
+    doi: 10.5555/ccdoi345678
+    uuid: 345e6789-f90b-34d5-d678-426614173456
+    repo_url: 'https://github.com/commoncrawl/cc-crawl-data'
+# Model Content Metadata
+tags:
+  - text generation
+  - transformer
+  - llama
+  - tiny
+  - tiny model
+pipeline_tag:
+  - text-classification
+languages:
+  - en
 ```