mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 21:37:19 +01:00
commit
8bd47ce5d6
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
name: Low Severity Bug
|
||||||
|
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
|
||||||
|
title: "Bug: "
|
||||||
|
labels: ["bug-unconfirmed", "low severity"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Thanks for taking the time to fill out this bug report!
|
||||||
|
Please include information about your system, the steps to reproduce the bug,
|
||||||
|
and the version of llama.cpp that you are using.
|
||||||
|
If possible, please provide a minimal code example that reproduces the bug.
|
||||||
|
- type: textarea
|
||||||
|
id: what-happened
|
||||||
|
attributes:
|
||||||
|
label: What happened?
|
||||||
|
description: Also tell us, what did you expect to happen?
|
||||||
|
placeholder: Tell us what you see!
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: textarea
|
||||||
|
id: version
|
||||||
|
attributes:
|
||||||
|
label: Name and Version
|
||||||
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
|
placeholder: |
|
||||||
|
$./main --version
|
||||||
|
version: 2999 (42b4109e)
|
||||||
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: dropdown
|
||||||
|
id: operating-system
|
||||||
|
attributes:
|
||||||
|
label: What operating system are you seeing the problem on?
|
||||||
|
multiple: true
|
||||||
|
options:
|
||||||
|
- Linux
|
||||||
|
- Mac
|
||||||
|
- Windows
|
||||||
|
- BSD
|
||||||
|
- Other? (Please let us know in description)
|
||||||
|
validations:
|
||||||
|
required: false
|
||||||
|
- type: textarea
|
||||||
|
id: logs
|
||||||
|
attributes:
|
||||||
|
label: Relevant log output
|
||||||
|
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||||
|
render: shell
|
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
name: Medium Severity Bug
|
||||||
|
description: Used to report medium severity bugs in llama.cpp (e.g. malfunctioning features that are generally still usable)
|
||||||
|
title: "Bug: "
|
||||||
|
labels: ["bug-unconfirmed", "medium severity"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Thanks for taking the time to fill out this bug report!
|
||||||
|
Please include information about your system, the steps to reproduce the bug,
|
||||||
|
and the version of llama.cpp that you are using.
|
||||||
|
If possible, please provide a minimal code example that reproduces the bug.
|
||||||
|
- type: textarea
|
||||||
|
id: what-happened
|
||||||
|
attributes:
|
||||||
|
label: What happened?
|
||||||
|
description: Also tell us, what did you expect to happen?
|
||||||
|
placeholder: Tell us what you see!
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: textarea
|
||||||
|
id: version
|
||||||
|
attributes:
|
||||||
|
label: Name and Version
|
||||||
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
|
placeholder: |
|
||||||
|
$./main --version
|
||||||
|
version: 2999 (42b4109e)
|
||||||
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: dropdown
|
||||||
|
id: operating-system
|
||||||
|
attributes:
|
||||||
|
label: What operating system are you seeing the problem on?
|
||||||
|
multiple: true
|
||||||
|
options:
|
||||||
|
- Linux
|
||||||
|
- Mac
|
||||||
|
- Windows
|
||||||
|
- BSD
|
||||||
|
- Other? (Please let us know in description)
|
||||||
|
validations:
|
||||||
|
required: false
|
||||||
|
- type: textarea
|
||||||
|
id: logs
|
||||||
|
attributes:
|
||||||
|
label: Relevant log output
|
||||||
|
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||||
|
render: shell
|
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
name: High Severity Bug
|
||||||
|
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
|
||||||
|
title: "Bug: "
|
||||||
|
labels: ["bug-unconfirmed", "high severity"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Thanks for taking the time to fill out this bug report!
|
||||||
|
Please include information about your system, the steps to reproduce the bug,
|
||||||
|
and the version of llama.cpp that you are using.
|
||||||
|
If possible, please provide a minimal code example that reproduces the bug.
|
||||||
|
- type: textarea
|
||||||
|
id: what-happened
|
||||||
|
attributes:
|
||||||
|
label: What happened?
|
||||||
|
description: Also tell us, what did you expect to happen?
|
||||||
|
placeholder: Tell us what you see!
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: textarea
|
||||||
|
id: version
|
||||||
|
attributes:
|
||||||
|
label: Name and Version
|
||||||
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
|
placeholder: |
|
||||||
|
$./main --version
|
||||||
|
version: 2999 (42b4109e)
|
||||||
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: dropdown
|
||||||
|
id: operating-system
|
||||||
|
attributes:
|
||||||
|
label: What operating system are you seeing the problem on?
|
||||||
|
multiple: true
|
||||||
|
options:
|
||||||
|
- Linux
|
||||||
|
- Mac
|
||||||
|
- Windows
|
||||||
|
- BSD
|
||||||
|
- Other? (Please let us know in description)
|
||||||
|
validations:
|
||||||
|
required: false
|
||||||
|
- type: textarea
|
||||||
|
id: logs
|
||||||
|
attributes:
|
||||||
|
label: Relevant log output
|
||||||
|
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||||
|
render: shell
|
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
name: Critical Severity Bug
|
||||||
|
description: Used to report critical severity bugs in llama.cpp (e.g. crashes, corruption, data loss)
|
||||||
|
title: "Bug: "
|
||||||
|
labels: ["bug-unconfirmed", "critical severity"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Thanks for taking the time to fill out this bug report!
|
||||||
|
Please include information about your system, the steps to reproduce the bug,
|
||||||
|
and the version of llama.cpp that you are using.
|
||||||
|
If possible, please provide a minimal code example that reproduces the bug.
|
||||||
|
- type: textarea
|
||||||
|
id: what-happened
|
||||||
|
attributes:
|
||||||
|
label: What happened?
|
||||||
|
description: Also tell us, what did you expect to happen?
|
||||||
|
placeholder: Tell us what you see!
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: textarea
|
||||||
|
id: version
|
||||||
|
attributes:
|
||||||
|
label: Name and Version
|
||||||
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
|
placeholder: |
|
||||||
|
$./main --version
|
||||||
|
version: 2999 (42b4109e)
|
||||||
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
- type: dropdown
|
||||||
|
id: operating-system
|
||||||
|
attributes:
|
||||||
|
label: What operating system are you seeing the problem on?
|
||||||
|
multiple: true
|
||||||
|
options:
|
||||||
|
- Linux
|
||||||
|
- Mac
|
||||||
|
- Windows
|
||||||
|
- BSD
|
||||||
|
- Other? (Please let us know in description)
|
||||||
|
validations:
|
||||||
|
required: false
|
||||||
|
- type: textarea
|
||||||
|
id: logs
|
||||||
|
attributes:
|
||||||
|
label: Relevant log output
|
||||||
|
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||||
|
render: shell
|
51
.github/ISSUE_TEMPLATE/05-enhancement.yml
vendored
Normal file
51
.github/ISSUE_TEMPLATE/05-enhancement.yml
vendored
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
name: Enhancement
|
||||||
|
description: Used to request enhancements for llama.cpp
|
||||||
|
title: "Feature Request: "
|
||||||
|
labels: ["enhancement"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed need to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
|
||||||
|
|
||||||
|
- type: checkboxes
|
||||||
|
id: prerequisites
|
||||||
|
attributes:
|
||||||
|
label: Prerequisites
|
||||||
|
description: Please confirm the following before submitting your enhancement request.
|
||||||
|
options:
|
||||||
|
- label: I am running the latest code. Mention the version if possible as well.
|
||||||
|
required: true
|
||||||
|
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||||
|
required: true
|
||||||
|
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
|
||||||
|
required: true
|
||||||
|
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: feature-description
|
||||||
|
attributes:
|
||||||
|
label: Feature Description
|
||||||
|
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
||||||
|
placeholder: Detailed description of the enhancement
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: motivation
|
||||||
|
attributes:
|
||||||
|
label: Motivation
|
||||||
|
description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
||||||
|
placeholder: Explanation of why this feature is needed and its benefits
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: possible-implementation
|
||||||
|
attributes:
|
||||||
|
label: Possible Implementation
|
||||||
|
description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
||||||
|
placeholder: Detailed description of potential implementation
|
||||||
|
validations:
|
||||||
|
required: false
|
38
.github/ISSUE_TEMPLATE/06-question.yml
vendored
Normal file
38
.github/ISSUE_TEMPLATE/06-question.yml
vendored
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
name: Question
|
||||||
|
description: Used to ask questions about llama.cpp
|
||||||
|
title: "Question: "
|
||||||
|
labels: ["question"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
[Please search for your question in Discussions first if you have a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a)
|
||||||
|
|
||||||
|
- type: checkboxes
|
||||||
|
id: prerequisites
|
||||||
|
attributes:
|
||||||
|
label: Prerequisites
|
||||||
|
description: Please confirm the following before submitting your question.
|
||||||
|
options:
|
||||||
|
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
|
||||||
|
required: true
|
||||||
|
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions.
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: background-description
|
||||||
|
attributes:
|
||||||
|
label: Background Description
|
||||||
|
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do, as a question.
|
||||||
|
placeholder: Detailed description of your question
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: possible-answer
|
||||||
|
attributes:
|
||||||
|
label: Possible Answer
|
||||||
|
description: If you have some idea of possible answers you want to confirm, that would also be appreciated.
|
||||||
|
placeholder: Your idea of possible answers
|
||||||
|
validations:
|
||||||
|
required: false
|
28
.github/ISSUE_TEMPLATE/07-refactor.yml
vendored
Normal file
28
.github/ISSUE_TEMPLATE/07-refactor.yml
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
name: Refactor (Maintainers)
|
||||||
|
description: Used to track refactoring opportunities
|
||||||
|
title: "Refactor: "
|
||||||
|
labels: ["refactor"]
|
||||||
|
body:
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
|
||||||
|
You may also want to check [pull requests with the refactor label](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates.
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: background-description
|
||||||
|
attributes:
|
||||||
|
label: Background Description
|
||||||
|
description: Please provide a detailed written description of the pain points you are trying to solve.
|
||||||
|
placeholder: Detailed description behind your motivation to request refactor
|
||||||
|
validations:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
id: possible-approaches
|
||||||
|
attributes:
|
||||||
|
label: Possible Refactor Approaches
|
||||||
|
description: If you have some idea of possible approaches to solve this problem, please describe them here. You may want to make it a todo list.
|
||||||
|
placeholder: Your idea of possible refactoring opportunity/approaches
|
||||||
|
validations:
|
||||||
|
required: false
|
11
.github/ISSUE_TEMPLATE/bug.md
vendored
11
.github/ISSUE_TEMPLATE/bug.md
vendored
@ -1,11 +0,0 @@
|
|||||||
---
|
|
||||||
name: Bug template
|
|
||||||
about: Used to report bugs in llama.cpp
|
|
||||||
labels: ["bug-unconfirmed"]
|
|
||||||
assignees: ''
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
|
|
||||||
|
|
||||||
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
|
|
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
@ -1,28 +0,0 @@
|
|||||||
---
|
|
||||||
name: Enhancement template
|
|
||||||
about: Used to request enhancements for llama.cpp
|
|
||||||
labels: ["enhancement"]
|
|
||||||
assignees: ''
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Prerequisites
|
|
||||||
|
|
||||||
Please answer the following questions for yourself before submitting an issue.
|
|
||||||
|
|
||||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
|
||||||
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
|
||||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
|
||||||
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
|
|
||||||
|
|
||||||
# Feature Description
|
|
||||||
|
|
||||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
|
||||||
|
|
||||||
# Motivation
|
|
||||||
|
|
||||||
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
|
||||||
|
|
||||||
# Possible Implementation
|
|
||||||
|
|
||||||
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
|
14
.github/labeler.yml
vendored
14
.github/labeler.yml
vendored
@ -1,5 +1,16 @@
|
|||||||
# https://github.com/actions/labeler
|
# https://github.com/actions/labeler
|
||||||
|
Kompute:
|
||||||
|
- changed-files:
|
||||||
|
- any-glob-to-any-file:
|
||||||
|
- ggml-kompute.h
|
||||||
|
- ggml-kompute.cpp
|
||||||
|
- README-kompute.md
|
||||||
|
Apple Metal:
|
||||||
|
- changed-files:
|
||||||
|
- any-glob-to-any-file:
|
||||||
|
- ggml-metal.h
|
||||||
|
- ggml-metal.cpp
|
||||||
|
- README-metal.md
|
||||||
SYCL:
|
SYCL:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
@ -9,6 +20,7 @@ SYCL:
|
|||||||
Nvidia GPU:
|
Nvidia GPU:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
|
- ggml-cuda.h
|
||||||
- ggml-cuda/**
|
- ggml-cuda/**
|
||||||
Vulkan:
|
Vulkan:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
|
5
.github/workflows/docker.yml
vendored
5
.github/workflows/docker.yml
vendored
@ -42,8 +42,9 @@ jobs:
|
|||||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
# TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
|
||||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
#- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
|
#- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
steps:
|
steps:
|
||||||
- name: Check out the repo
|
- name: Check out the repo
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
@ -72,6 +72,7 @@ else()
|
|||||||
set(INS_ENB ON)
|
set(INS_ENB ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
option(LLAMA_SVE "llama: enable SVE" OFF)
|
||||||
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
|
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
|
||||||
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
|
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
|
||||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||||
@ -1040,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
|||||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||||
endif()
|
endif()
|
||||||
|
if (LLAMA_SVE)
|
||||||
|
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
{
|
{
|
||||||
"version": 4,
|
"version": 4,
|
||||||
"configurePresets": [
|
"configurePresets": [
|
||||||
{
|
{
|
||||||
@ -40,6 +40,10 @@
|
|||||||
|
|
||||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
||||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
|
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||||
|
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
|
||||||
|
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
3
Makefile
3
Makefile
@ -441,6 +441,9 @@ endif # JETSON_EOL_MODULE_DETECT
|
|||||||
ifdef LLAMA_DEBUG
|
ifdef LLAMA_DEBUG
|
||||||
MK_NVCCFLAGS += -lineinfo
|
MK_NVCCFLAGS += -lineinfo
|
||||||
endif # LLAMA_DEBUG
|
endif # LLAMA_DEBUG
|
||||||
|
ifdef LLAMA_CUDA_DEBUG
|
||||||
|
MK_NVCCFLAGS += --device-debug
|
||||||
|
endif # LLAMA_CUDA_DEBUG
|
||||||
ifdef LLAMA_CUDA_NVCC
|
ifdef LLAMA_CUDA_NVCC
|
||||||
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
|
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
|
||||||
else
|
else
|
||||||
|
@ -127,6 +127,7 @@ Typically finetunes of the base models below are supported as well.
|
|||||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||||
- [x] [OLMo](https://allenai.org/olmo)
|
- [x] [OLMo](https://allenai.org/olmo)
|
||||||
|
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
|
||||||
|
|
||||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||||
|
|
||||||
@ -140,6 +141,7 @@ Typically finetunes of the base models below are supported as well.
|
|||||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||||
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
||||||
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
||||||
|
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
|
||||||
|
|
||||||
**HTTP server**
|
**HTTP server**
|
||||||
|
|
||||||
@ -201,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||||||
|
|
||||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||||
|
|
||||||
|
**Tools:**
|
||||||
|
|
||||||
|
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||||
|
424
ci/run.sh
424
ci/run.sh
@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function gg_get_model {
|
function gg_get_model {
|
||||||
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||||
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||||
if [[ -s $gguf_3b ]]; then
|
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||||
echo -n "$gguf_3b"
|
if [[ -s $gguf_0 ]]; then
|
||||||
elif [[ -s $gguf_7b ]]; then
|
echo -n "$gguf_0"
|
||||||
echo -n "$gguf_7b"
|
elif [[ -s $gguf_1 ]]; then
|
||||||
|
echo -n "$gguf_1"
|
||||||
|
elif [[ -s $gguf_2 ]]; then
|
||||||
|
echo -n "$gguf_2"
|
||||||
else
|
else
|
||||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||||
exit 1
|
exit 1
|
||||||
@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
|
|||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_3b_v2
|
|
||||||
|
|
||||||
function gg_run_open_llama_3b_v2 {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
|
||||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
|
||||||
|
|
||||||
path_models="../models-mnt/open-llama/3B-v2"
|
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
|
||||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
|
||||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
|
||||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
|
||||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
|
||||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
|
||||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
|
||||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
|
||||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
|
||||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
|
||||||
|
|
||||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
|
|
||||||
function check_ppl {
|
|
||||||
qnt="$1"
|
|
||||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
|
|
||||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
|
||||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
|
||||||
return 20
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_open_llama_3b_v2 {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
|
||||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
|
||||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
|
||||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
|
||||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
|
||||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
|
||||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
|
||||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
|
||||||
}
|
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_llama_7b_v2
|
||||||
# requires: GG_BUILD_CUDA
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
|
|||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
|
|||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# pythia_1.4b
|
||||||
|
|
||||||
|
function gg_run_pythia_1_4b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/1.4B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_1_4b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 1.4B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# pythia_2_8b
|
||||||
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
|
function gg_run_pythia_2_8b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/2.8B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_2_8b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 2.8B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
# bge-small
|
# bge-small
|
||||||
|
|
||||||
function gg_run_embd_bge_small {
|
function gg_run_embd_bge_small {
|
||||||
@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
|
|||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert-hf-to-gguf.py ${path_models}
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
@ -606,10 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||||||
|
|
||||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||||
#test $ret -eq 0 && gg_run open_llama_3b_v2
|
test $ret -eq 0 && gg_run pythia_1_4b
|
||||||
date # dummy
|
|
||||||
else
|
else
|
||||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
test $ret -eq 0 && gg_run pythia_2_8b
|
||||||
|
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||||
fi
|
fi
|
||||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||||
|
@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||||||
params.interactive_specials = true;
|
params.interactive_specials = true;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (arg == "--special") {
|
||||||
|
params.special = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (arg == "--embedding") {
|
if (arg == "--embedding") {
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
return true;
|
return true;
|
||||||
@ -1362,6 +1366,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||||||
printf(" -h, --help show this help message and exit\n");
|
printf(" -h, --help show this help message and exit\n");
|
||||||
printf(" --version show version and build info\n");
|
printf(" --version show version and build info\n");
|
||||||
printf(" -i, --interactive run in interactive mode\n");
|
printf(" -i, --interactive run in interactive mode\n");
|
||||||
|
printf(" --special special tokens output enabled\n");
|
||||||
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
|
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
|
||||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||||
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
|
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
|
||||||
@ -1855,11 +1860,15 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|||||||
|
|
||||||
std::string fs_get_cache_directory() {
|
std::string fs_get_cache_directory() {
|
||||||
std::string cache_directory = "";
|
std::string cache_directory = "";
|
||||||
|
auto ensure_trailing_slash = [](std::string p) {
|
||||||
|
// Make sure to add trailing slash
|
||||||
|
if (p.back() != DIRECTORY_SEPARATOR) {
|
||||||
|
p += DIRECTORY_SEPARATOR;
|
||||||
|
}
|
||||||
|
return p;
|
||||||
|
};
|
||||||
if (getenv("LLAMA_CACHE")) {
|
if (getenv("LLAMA_CACHE")) {
|
||||||
cache_directory = std::getenv("LLAMA_CACHE");
|
cache_directory = std::getenv("LLAMA_CACHE");
|
||||||
if (cache_directory.back() != DIRECTORY_SEPARATOR) {
|
|
||||||
cache_directory += DIRECTORY_SEPARATOR;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
if (std::getenv("XDG_CACHE_HOME")) {
|
if (std::getenv("XDG_CACHE_HOME")) {
|
||||||
@ -1870,12 +1879,12 @@ std::string fs_get_cache_directory() {
|
|||||||
#elif defined(__APPLE__)
|
#elif defined(__APPLE__)
|
||||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||||
#elif defined(_WIN32)
|
#elif defined(_WIN32)
|
||||||
cache_directory = std::getenv("APPDATA");
|
cache_directory = std::getenv("LOCALAPPDATA");
|
||||||
#endif // __linux__
|
#endif // __linux__
|
||||||
|
cache_directory = ensure_trailing_slash(cache_directory);
|
||||||
cache_directory += "llama.cpp";
|
cache_directory += "llama.cpp";
|
||||||
cache_directory += DIRECTORY_SEPARATOR;
|
|
||||||
}
|
}
|
||||||
return cache_directory;
|
return ensure_trailing_slash(cache_directory);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -2840,6 +2849,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||||
|
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||||
|
@ -146,6 +146,7 @@ struct gpt_params {
|
|||||||
bool use_color = false; // use color to distinguish generations and inputs
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
bool interactive = false; // interactive mode
|
bool interactive = false; // interactive mode
|
||||||
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
||||||
|
bool special = false; // enable special token output
|
||||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||||
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||||
|
@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
|
|||||||
|
|
||||||
params.custom_n_ctx = false;
|
params.custom_n_ctx = false;
|
||||||
|
|
||||||
params.use_flash = true;
|
params.use_flash = false;
|
||||||
params.use_checkpointing = true;
|
params.use_checkpointing = true;
|
||||||
|
|
||||||
params.sample_start = "";
|
params.sample_start = "";
|
||||||
|
@ -81,6 +81,7 @@ models = [
|
|||||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
|
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -313,11 +313,10 @@ class Model:
|
|||||||
data = data.astype(np.float32)
|
data = data.astype(np.float32)
|
||||||
data_qtype = gguf.GGMLQuantizationType.F32
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
|
||||||
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
|
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
||||||
|
|
||||||
# reverse shape to make it similar to the internal ggml dimension order
|
# reverse shape to make it similar to the internal ggml dimension order
|
||||||
shape_str = f"""{{{', '.join(str(n) for n in reversed(
|
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
|
||||||
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
|
|
||||||
)}}}"""
|
|
||||||
|
|
||||||
# n_dims is implicit in the shape
|
# n_dims is implicit in the shape
|
||||||
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||||
@ -474,6 +473,9 @@ class Model:
|
|||||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||||
res = "jina-v2-de"
|
res = "jina-v2-de"
|
||||||
|
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||||
|
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||||
|
res = "smaug-bpe"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
logger.warning("\n")
|
logger.warning("\n")
|
||||||
@ -2355,7 +2357,8 @@ class CommandR2Model(Model):
|
|||||||
|
|
||||||
# max_position_embeddings = 8192 in config.json but model was actually
|
# max_position_embeddings = 8192 in config.json but model was actually
|
||||||
# trained on 128k context length
|
# trained on 128k context length
|
||||||
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
|
# aya-23 models don't have model_max_length specified
|
||||||
|
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
@ -2428,6 +2431,236 @@ class JinaBertV2Model(BertModel):
|
|||||||
self.gguf_writer.add_add_eos_token(True)
|
self.gguf_writer.add_add_eos_token(True)
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("ArcticForCausalLM")
class ArcticModel(Model):
    """Converter for `ArcticForCausalLM` checkpoints (GGUF arch ``ARCTIC``).

    Builds the vocabulary from ``tokenizer.model`` with overrides taken from
    ``tokenizer_config.json``, and merges the per-expert MoE weights of each
    block into stacked 3D tensors.
    """

    model_arch = gguf.MODEL_ARCH.ARCTIC

    def set_vocab(self):
        """Write the tokenizer vocabulary (tokens, scores, types) to the GGUF writer.

        Exits the process with status 1 if ``tokenizer.model`` is missing.
        """
        # The reason for using a custom implementation here is that the
        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'

        if not tokenizer_path.is_file():
            logger.error(f'Error: Missing {tokenizer_path}')
            sys.exit(1)

        # Read the whole vocabulary from the tokenizer.model file
        tokenizer = SentencePieceProcessor()
        tokenizer.LoadFromFile(str(tokenizer_path))

        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        # Pre-fill every slot with a placeholder so ids that the tokenizer
        # does not define still get an entry.
        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size

        for token_id in range(tokenizer.vocab_size()):

            piece = tokenizer.IdToPiece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.GetScore(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.IsUnknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.IsControl(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.IsUnused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.IsByte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens[token_id] = text
            scores[token_id] = score
            toktypes[token_id] = toktype

        # Use the added_tokens_decoder field from tokenizer_config.json as the source
        # of information about added/redefined tokens and modify them accordingly.
        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
        if tokenizer_config_file.is_file():
            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
                tokenizer_config_json = json.load(f)

                if "added_tokens_decoder" in tokenizer_config_json:
                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
                    for token_id, token_json in added_tokens_decoder.items():
                        token_id = int(token_id)
                        if (token_id >= vocab_size):
                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                            continue

                        token_content = token_json["content"]
                        token_type = SentencePieceTokenTypes.USER_DEFINED
                        token_score = -10000.0

                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
                        # Set the score to 0.0 as in the original tokenizer.model
                        if ("special" in token_json) and token_json["special"]:
                            # .get(): a config without an "unk_token" entry must not
                            # abort the conversion with a KeyError; the token is then
                            # simply treated as CONTROL.
                            if token_content == tokenizer_config_json.get("unk_token"):
                                token_type = SentencePieceTokenTypes.UNKNOWN
                            else:
                                token_type = SentencePieceTokenTypes.CONTROL
                            token_score = 0.0

                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
                        tokens[token_id] = token_content.encode("utf-8")
                        toktypes[token_id] = token_type
                        scores[token_id] = token_score

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        """Add the vocab size and RoPE dimension count on top of the base parameters."""
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])

    # Per-block buffer of expert tensors awaiting merge; created lazily in modify_tensors.
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one source tensor to zero or more (GGUF name, tensor) pairs.

        q_proj/k_proj weights are permuted via ``LlamaModel.permute``. Expert
        weights are buffered per block; once ``n_experts * 3`` of them have
        been collected for a block, they are stacked along a new leading
        dimension and returned as three merged tensors. Until then an empty
        list is returned.
        """
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = self.hparams.get("num_key_value_heads")

        if name.endswith("q_proj.weight"):
            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
        if name.endswith("k_proj.weight"):
            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

        # process the experts separately
        if name.find("block_sparse_moe.experts") != -1:
            n_experts = self.hparams["num_local_experts"]

            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            if len(self._experts[bid]) >= n_experts * 3:
                tensors: list[tuple[str, Tensor]] = []

                # merge the experts into a single 3d tensor
                for wid in ["w1", "w2", "w3"]:
                    datas: list[Tensor] = []

                    for xid in range(n_experts):
                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
                        datas.append(self._experts[bid][ename])
                        # drop the buffered entry so write_tensors can detect leftovers
                        del self._experts[bid][ename]

                    data_torch = torch.stack(datas, dim=0)

                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"

                    new_name = self.map_tensor_name(merged_name)

                    tensors.append((new_name, data_torch))
                return tensors
            else:
                return []

        return [(self.map_tensor_name(name), data_torch)]

    def write_tensors(self):
        """Write all tensors, then fail if any buffered expert tensor was never merged.

        Raises:
            ValueError: if expert tensors remain in the per-block buffers.
        """
        super().write_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("DeepseekV2ForCausalLM")
class DeepseekV2Model(Model):
    """Converter for `DeepseekV2ForCausalLM` checkpoints (GGUF arch ``DEEPSEEK2``)."""

    model_arch = gguf.MODEL_ARCH.DEEPSEEK2

    def set_vocab(self):
        """Delegate vocabulary creation to the shared GPT-2 style helper."""
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
        """Write the architecture-specific hyperparameters after the base ones."""
        super().set_gguf_parameters()
        cfg = self.hparams
        writer = self.gguf_writer

        writer.add_leading_dense_block_count(cfg["first_k_dense_replace"])
        writer.add_vocab_size(cfg["vocab_size"])

        # q_lora_rank is optional: written only when present and not null.
        q_rank = cfg.get("q_lora_rank")
        if q_rank is not None:
            writer.add_q_lora_rank(q_rank)

        writer.add_kv_lora_rank(cfg["kv_lora_rank"])
        writer.add_key_length(cfg["qk_nope_head_dim"] + cfg["qk_rope_head_dim"])
        writer.add_value_length(cfg["v_head_dim"])
        writer.add_expert_feed_forward_length(cfg["moe_intermediate_size"])
        writer.add_expert_count(cfg["n_routed_experts"])
        writer.add_expert_shared_count(cfg["n_shared_experts"])
        writer.add_expert_weights_scale(cfg["routed_scaling_factor"])
        writer.add_rope_dimension_count(cfg["qk_rope_head_dim"])

        # RoPE scaling metadata is only emitted for a "yarn" config that carries a factor.
        rope = cfg.get("rope_scaling")
        if rope is not None and "factor" in rope and rope.get("type") == "yarn":
            writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            writer.add_rope_scaling_factor(rope["factor"])
            writer.add_rope_scaling_orig_ctx_len(rope["original_max_position_embeddings"])
            writer.add_rope_scaling_yarn_log_mul(0.1 * rope["mscale_all_dim"])

    # Per-block staging area for expert tensors; allocated on first use.
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename a tensor, or stage it if it belongs to a routed expert.

        Expert tensors are held back (empty list returned) until a block has
        collected ``n_routed_experts * 3`` of them; the block's down/gate/up
        projections are then each stacked on a new leading axis and returned.
        """
        # Anything that is not an expert weight is passed straight through.
        if "mlp.experts" not in name:
            return [(self.map_tensor_name(name), data_torch)]

        expert_total = self.hparams["n_routed_experts"]
        assert bid is not None

        if self._experts is None:
            self._experts = [{} for _ in range(self.block_count)]

        staged = self._experts[bid]
        staged[name] = data_torch

        if len(staged) < expert_total * 3:
            return []

        # merge the experts into a single 3d tensor
        merged: list[tuple[str, Tensor]] = []
        for proj in ("down_proj", "gate_proj", "up_proj"):
            # pop() hands back the tensor and clears its slot in one step
            parts: list[Tensor] = [
                staged.pop(f"model.layers.{bid}.mlp.experts.{xid}.{proj}.weight")
                for xid in range(expert_total)
            ]
            stacked = torch.stack(parts, dim=0)
            target = self.map_tensor_name(f"model.layers.{bid}.mlp.experts.{proj}.weight")
            merged.append((target, stacked))
        return merged

    def write_tensors(self):
        """Run the base writer, then raise ValueError if staged experts were left unmerged."""
        super().write_tensors()

        if self._experts is None:
            return

        # flatten `list[dict[str, Tensor]]` into `list[str]`
        leftovers = [key for block in self._experts for key in block.keys()]
        if len(leftovers) > 0:
            experts = leftovers
            raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
|
@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
|
|||||||
|
|
||||||
params.samples_start_after_nl = false;
|
params.samples_start_after_nl = false;
|
||||||
params.use_adam = true;
|
params.use_adam = true;
|
||||||
params.use_flash = true;
|
params.use_flash = false;
|
||||||
params.use_scratch = true;
|
params.use_scratch = true;
|
||||||
|
|
||||||
// only adam
|
// only adam
|
||||||
|
@ -7,8 +7,6 @@ android {
|
|||||||
namespace = "com.example.llama"
|
namespace = "com.example.llama"
|
||||||
compileSdk = 34
|
compileSdk = 34
|
||||||
|
|
||||||
ndkVersion = "26.1.10909125"
|
|
||||||
|
|
||||||
defaultConfig {
|
defaultConfig {
|
||||||
applicationId = "com.example.llama"
|
applicationId = "com.example.llama"
|
||||||
minSdk = 33
|
minSdk = 33
|
||||||
@ -20,17 +18,6 @@ android {
|
|||||||
vectorDrawables {
|
vectorDrawables {
|
||||||
useSupportLibrary = true
|
useSupportLibrary = true
|
||||||
}
|
}
|
||||||
ndk {
|
|
||||||
// Add NDK properties if wanted, e.g.
|
|
||||||
// abiFilters += listOf("arm64-v8a")
|
|
||||||
}
|
|
||||||
externalNativeBuild {
|
|
||||||
cmake {
|
|
||||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
|
||||||
cppFlags += listOf()
|
|
||||||
arguments += listOf()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
buildTypes {
|
buildTypes {
|
||||||
@ -55,17 +42,6 @@ android {
|
|||||||
composeOptions {
|
composeOptions {
|
||||||
kotlinCompilerExtensionVersion = "1.5.1"
|
kotlinCompilerExtensionVersion = "1.5.1"
|
||||||
}
|
}
|
||||||
packaging {
|
|
||||||
resources {
|
|
||||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
externalNativeBuild {
|
|
||||||
cmake {
|
|
||||||
path = file("src/main/cpp/CMakeLists.txt")
|
|
||||||
version = "3.22.1"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
@ -78,6 +54,7 @@ dependencies {
|
|||||||
implementation("androidx.compose.ui:ui-graphics")
|
implementation("androidx.compose.ui:ui-graphics")
|
||||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||||
implementation("androidx.compose.material3:material3")
|
implementation("androidx.compose.material3:material3")
|
||||||
|
implementation(project(":llama"))
|
||||||
testImplementation("junit:junit:4.13.2")
|
testImplementation("junit:junit:4.13.2")
|
||||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package com.example.llama
|
package com.example.llama
|
||||||
|
|
||||||
|
import android.llama.cpp.LLamaAndroid
|
||||||
import android.util.Log
|
import android.util.Log
|
||||||
import androidx.compose.runtime.getValue
|
import androidx.compose.runtime.getValue
|
||||||
import androidx.compose.runtime.mutableStateOf
|
import androidx.compose.runtime.mutableStateOf
|
||||||
@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
|
|||||||
import kotlinx.coroutines.flow.catch
|
import kotlinx.coroutines.flow.catch
|
||||||
import kotlinx.coroutines.launch
|
import kotlinx.coroutines.launch
|
||||||
|
|
||||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
|
||||||
companion object {
|
companion object {
|
||||||
@JvmStatic
|
@JvmStatic
|
||||||
private val NanosPerSecond = 1_000_000_000.0
|
private val NanosPerSecond = 1_000_000_000.0
|
||||||
@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
|
|
||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
try {
|
try {
|
||||||
llm.unload()
|
llamaAndroid.unload()
|
||||||
} catch (exc: IllegalStateException) {
|
} catch (exc: IllegalStateException) {
|
||||||
messages += exc.message!!
|
messages += exc.message!!
|
||||||
}
|
}
|
||||||
@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
messages += ""
|
messages += ""
|
||||||
|
|
||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
llm.send(text)
|
llamaAndroid.send(text)
|
||||||
.catch {
|
.catch {
|
||||||
Log.e(tag, "send() failed", it)
|
Log.e(tag, "send() failed", it)
|
||||||
messages += it.message!!
|
messages += it.message!!
|
||||||
@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
try {
|
try {
|
||||||
val start = System.nanoTime()
|
val start = System.nanoTime()
|
||||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
|
||||||
val end = System.nanoTime()
|
val end = System.nanoTime()
|
||||||
|
|
||||||
messages += warmupResult
|
messages += warmupResult
|
||||||
@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
return@launch
|
return@launch
|
||||||
}
|
}
|
||||||
|
|
||||||
messages += llm.bench(512, 128, 1, 3)
|
messages += llamaAndroid.bench(512, 128, 1, 3)
|
||||||
} catch (exc: IllegalStateException) {
|
} catch (exc: IllegalStateException) {
|
||||||
Log.e(tag, "bench() failed", exc)
|
Log.e(tag, "bench() failed", exc)
|
||||||
messages += exc.message!!
|
messages += exc.message!!
|
||||||
@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
|||||||
fun load(pathToModel: String) {
|
fun load(pathToModel: String) {
|
||||||
viewModelScope.launch {
|
viewModelScope.launch {
|
||||||
try {
|
try {
|
||||||
llm.load(pathToModel)
|
llamaAndroid.load(pathToModel)
|
||||||
messages += "Loaded $pathToModel"
|
messages += "Loaded $pathToModel"
|
||||||
} catch (exc: IllegalStateException) {
|
} catch (exc: IllegalStateException) {
|
||||||
Log.e(tag, "load() failed", exc)
|
Log.e(tag, "load() failed", exc)
|
||||||
|
@ -2,4 +2,5 @@
|
|||||||
plugins {
|
plugins {
|
||||||
id("com.android.application") version "8.2.0" apply false
|
id("com.android.application") version "8.2.0" apply false
|
||||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||||
|
id("com.android.library") version "8.2.0" apply false
|
||||||
}
|
}
|
||||||
|
1
examples/llama.android/llama/.gitignore
vendored
Normal file
1
examples/llama.android/llama/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/build
|
68
examples/llama.android/llama/build.gradle.kts
Normal file
68
examples/llama.android/llama/build.gradle.kts
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
// Build script for the `llama` Android library module: compiles the Kotlin
// sources and builds the native code through the CMake project referenced below.
plugins {
    id("com.android.library")
    id("org.jetbrains.kotlin.android")
}

android {
    namespace = "android.llama.cpp"
    compileSdk = 34

    defaultConfig {
        minSdk = 33

        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
        consumerProguardFiles("consumer-rules.pro")
        ndk {
            // Add NDK properties if wanted, e.g.
            // abiFilters += listOf("arm64-v8a")
        }
        externalNativeBuild {
            cmake {
                // Always configure the native build as a Release build.
                arguments += "-DCMAKE_BUILD_TYPE=Release"

                // Placeholder for extra C++ compiler flags.
                cppFlags("")
            }
        }
    }

    buildTypes {
        release {
            isMinifyEnabled = false
            proguardFiles(
                getDefaultProguardFile("proguard-android-optimize.txt"),
                "proguard-rules.pro"
            )
        }
    }
    externalNativeBuild {
        cmake {
            // CMake project that builds the JNI library.
            path("src/main/cpp/CMakeLists.txt")
            version = "3.22.1"
        }
    }
    compileOptions {
        sourceCompatibility = JavaVersion.VERSION_1_8
        targetCompatibility = JavaVersion.VERSION_1_8
    }
    kotlinOptions {
        jvmTarget = "1.8"
    }

    packaging {
        resources {
            excludes += "/META-INF/{AL2.0,LGPL2.1}"
        }
    }
}

dependencies {

    implementation("androidx.core:core-ktx:1.12.0")
    implementation("androidx.appcompat:appcompat:1.6.1")
    implementation("com.google.android.material:material:1.11.0")
    testImplementation("junit:junit:4.13.2")
    androidTestImplementation("androidx.test.ext:junit:1.1.5")
    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
}
|
0
examples/llama.android/llama/consumer-rules.pro
Normal file
0
examples/llama.android/llama/consumer-rules.pro
Normal file
21
examples/llama.android/llama/proguard-rules.pro
vendored
Normal file
21
examples/llama.android/llama/proguard-rules.pro
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# Add project specific ProGuard rules here.
|
||||||
|
# You can control the set of applied configuration files using the
|
||||||
|
# proguardFiles setting in build.gradle.
|
||||||
|
#
|
||||||
|
# For more details, see
|
||||||
|
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||||
|
|
||||||
|
# If your project uses WebView with JS, uncomment the following
|
||||||
|
# and specify the fully qualified class name to the JavaScript interface
|
||||||
|
# class:
|
||||||
|
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||||
|
# public *;
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Uncomment this to preserve the line number information for
|
||||||
|
# debugging stack traces.
|
||||||
|
#-keepattributes SourceFile,LineNumberTable
|
||||||
|
|
||||||
|
# If you keep the line number information, uncomment this to
|
||||||
|
# hide the original source file name.
|
||||||
|
#-renamesourcefileattribute SourceFile
|
@ -0,0 +1,24 @@
|
|||||||
|
package android.llama.cpp
|
||||||
|
|
||||||
|
import androidx.test.platform.app.InstrumentationRegistry
|
||||||
|
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||||
|
|
||||||
|
import org.junit.Test
|
||||||
|
import org.junit.runner.RunWith
|
||||||
|
|
||||||
|
import org.junit.Assert.*
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instrumented test, which will execute on an Android device.
|
||||||
|
*
|
||||||
|
* See [testing documentation](http://d.android.com/tools/testing).
|
||||||
|
*/
|
||||||
|
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
    /** Verifies the package name reported by the instrumentation's target context. */
    @Test
    fun useAppContext() {
        // Package name of the context of the app under test.
        val targetPackage = InstrumentationRegistry
            .getInstrumentation()
            .targetContext
            .packageName

        assertEquals("android.llama.cpp.test", targetPackage)
    }
}
|
@ -0,0 +1,4 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
|
||||||
|
|
||||||
|
</manifest>
|
49
examples/llama.android/llama/src/main/cpp/CMakeLists.txt
Normal file
49
examples/llama.android/llama/src/main/cpp/CMakeLists.txt
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.

# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)

# Declares the project name. The project name can be accessed via ${PROJECT_NAME}.
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")

# Fetch the llama.cpp sources at configure time.
# NOTE(review): GIT_TAG tracks the `master` branch, so the fetched revision is
# not pinned -- consider a fixed commit hash for reproducible builds.
include(FetchContent)
FetchContent_Declare(
        llama
        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
        GIT_TAG        master
)

# Also provides "common"
FetchContent_MakeAvailable(llama)

# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
        # List C/C++ source files with relative paths to this CMakeLists.txt.
        llama-android.cpp)

# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
# PRIVATE: the dependencies are used to build this library only and are not
# propagated to targets that link against it.
target_link_libraries(${CMAKE_PROJECT_NAME}
        PRIVATE
        # List the libraries to link into the target library
        llama
        common
        android
        log)
|
@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jlong JNICALL
|
JNIEXPORT jlong JNICALL
|
||||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
|
||||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||||
@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
|
||||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jlong JNICALL
|
JNIEXPORT jlong JNICALL
|
||||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||||
|
|
||||||
if (!model) {
|
if (!model) {
|
||||||
@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
|
||||||
llama_free(reinterpret_cast<llama_context *>(context));
|
llama_free(reinterpret_cast<llama_context *>(context));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
|
||||||
llama_log_set(log_callback, NULL);
|
llama_log_set(log_callback, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jstring JNICALL
|
JNIEXPORT jstring JNICALL
|
||||||
Java_com_example_llama_Llm_bench_1model(
|
Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||||
JNIEnv *env,
|
JNIEnv *env,
|
||||||
jobject,
|
jobject,
|
||||||
jlong context_pointer,
|
jlong context_pointer,
|
||||||
@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jlong JNICALL
|
JNIEXPORT jlong JNICALL
|
||||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||||
|
|
||||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||||
|
|
||||||
@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jstring JNICALL
|
JNIEXPORT jstring JNICALL
|
||||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
|
||||||
return env->NewStringUTF(llama_print_system_info());
|
return env->NewStringUTF(llama_print_system_info());
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jint JNICALL
|
JNIEXPORT jint JNICALL
|
||||||
Java_com_example_llama_Llm_completion_1init(
|
Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||||
JNIEnv *env,
|
JNIEnv *env,
|
||||||
jobject,
|
jobject,
|
||||||
jlong context_pointer,
|
jlong context_pointer,
|
||||||
@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jstring JNICALL
|
JNIEXPORT jstring JNICALL
|
||||||
Java_com_example_llama_Llm_completion_1loop(
|
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||||
JNIEnv * env,
|
JNIEnv * env,
|
||||||
jobject,
|
jobject,
|
||||||
jlong context_pointer,
|
jlong context_pointer,
|
||||||
@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
|
|||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||||
}
|
}
|
@ -1,4 +1,4 @@
|
|||||||
package com.example.llama
|
package android.llama.cpp
|
||||||
|
|
||||||
import android.util.Log
|
import android.util.Log
|
||||||
import kotlinx.coroutines.CoroutineDispatcher
|
import kotlinx.coroutines.CoroutineDispatcher
|
||||||
@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
|
|||||||
import java.util.concurrent.Executors
|
import java.util.concurrent.Executors
|
||||||
import kotlin.concurrent.thread
|
import kotlin.concurrent.thread
|
||||||
|
|
||||||
class Llm {
|
class LLamaAndroid {
|
||||||
private val tag: String? = this::class.simpleName
|
private val tag: String? = this::class.simpleName
|
||||||
|
|
||||||
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
||||||
@ -165,8 +165,8 @@ class Llm {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Enforce only one instance of Llm.
|
// Enforce only one instance of LLamaAndroid.
|
||||||
private val _instance: Llm = Llm()
|
private val _instance: LLamaAndroid = LLamaAndroid()
|
||||||
|
|
||||||
fun instance(): Llm = _instance
|
fun instance(): LLamaAndroid = _instance
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -0,0 +1,17 @@
|
|||||||
|
package android.llama.cpp
|
||||||
|
|
||||||
|
import org.junit.Test
|
||||||
|
|
||||||
|
import org.junit.Assert.*
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Example local unit test, which will execute on the development machine (host).
|
||||||
|
*
|
||||||
|
* See [testing documentation](http://d.android.com/tools/testing).
|
||||||
|
*/
|
||||||
|
class ExampleUnitTest {
|
||||||
|
@Test
|
||||||
|
fun addition_isCorrect() {
|
||||||
|
assertEquals(4, 2 + 2)
|
||||||
|
}
|
||||||
|
}
|
@ -15,3 +15,4 @@ dependencyResolutionManagement {
|
|||||||
|
|
||||||
rootProject.name = "LlamaAndroid"
|
rootProject.name = "LlamaAndroid"
|
||||||
include(":app")
|
include(":app")
|
||||||
|
include(":llama")
|
||||||
|
@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
|
|||||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||||
|
|
||||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
|
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||||
|
|
||||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||||
|
@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
|
|||||||
// display text
|
// display text
|
||||||
if (input_echo && display) {
|
if (input_echo && display) {
|
||||||
for (auto id : embd) {
|
for (auto id : embd) {
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
|
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||||
printf("%s", token_str.c_str());
|
|
||||||
|
|
||||||
|
// Console/Stream Output
|
||||||
|
fprintf(stdout, "%s", token_str.c_str());
|
||||||
|
|
||||||
|
// Record Displayed Tokens To Log
|
||||||
|
// Note: Generated tokens are created one by one hence this check
|
||||||
if (embd.size() > 1) {
|
if (embd.size() > 1) {
|
||||||
|
// Incoming Requested Tokens
|
||||||
input_tokens.push_back(id);
|
input_tokens.push_back(id);
|
||||||
} else {
|
} else {
|
||||||
|
// Outgoing Generated Tokens
|
||||||
output_tokens.push_back(id);
|
output_tokens.push_back(id);
|
||||||
output_ss << token_str;
|
output_ss << token_str;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// reset color to default if there is no pending user input
|
// reset color to default if there is no pending user input
|
||||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||||
console::set_display(console::reset);
|
console::set_display(console::reset);
|
||||||
|
@ -594,7 +594,7 @@
|
|||||||
message = html`<${Probabilities} data=${data} />`
|
message = html`<${Probabilities} data=${data} />`
|
||||||
} else {
|
} else {
|
||||||
const text = isArrayMessage ?
|
const text = isArrayMessage ?
|
||||||
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
data.map(msg => msg.content).join('') :
|
||||||
data;
|
data;
|
||||||
message = isCompletionMode ?
|
message = isCompletionMode ?
|
||||||
text :
|
text :
|
||||||
@ -877,7 +877,11 @@
|
|||||||
|
|
||||||
// poor mans markdown replacement
|
// poor mans markdown replacement
|
||||||
const Markdownish = (params) => {
|
const Markdownish = (params) => {
|
||||||
const md = params.text
|
const chunks = params.text.split('```');
|
||||||
|
|
||||||
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
|
if (i % 2 === 0) { // outside code block
|
||||||
|
chunks[i] = chunks[i]
|
||||||
.replace(/&/g, '&')
|
.replace(/&/g, '&')
|
||||||
.replace(/</g, '<')
|
.replace(/</g, '<')
|
||||||
.replace(/>/g, '>')
|
.replace(/>/g, '>')
|
||||||
@ -889,7 +893,14 @@
|
|||||||
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
||||||
.replace(/`(.*?)`/g, '<code>$1</code>')
|
.replace(/`(.*?)`/g, '<code>$1</code>')
|
||||||
.replace(/\n/gim, '<br />');
|
.replace(/\n/gim, '<br />');
|
||||||
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
|
} else { // inside code block
|
||||||
|
chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const restoredText = chunks.join('');
|
||||||
|
|
||||||
|
return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const ModelGenerationInfo = (params) => {
|
const ModelGenerationInfo = (params) => {
|
||||||
@ -903,6 +914,7 @@
|
|||||||
`
|
`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// simple popover impl
|
// simple popover impl
|
||||||
const Popover = (props) => {
|
const Popover = (props) => {
|
||||||
const isOpen = useSignal(false);
|
const isOpen = useSignal(false);
|
||||||
@ -1054,4 +1066,3 @@
|
|||||||
</body>
|
</body>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<title>SimpleChat (LlamaCPP, ...) </title>
|
<title>SimpleChat LlamaCppEtal </title>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
<meta name="message" content="Save Nature Save Earth" />
|
<meta name="message" content="Save Nature Save Earth" />
|
||||||
@ -30,20 +30,17 @@
|
|||||||
<hr>
|
<hr>
|
||||||
<div class="sameline">
|
<div class="sameline">
|
||||||
<label for="system-in">System</label>
|
<label for="system-in">System</label>
|
||||||
<input type="text" name="system" id="system-in" class="flex-grow"/>
|
<input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
<div id="chat-div">
|
<div id="chat-div">
|
||||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
<p> You need to have javascript enabled.</p>
|
||||||
<p> Enter your text to the ai assistant below.</p>
|
|
||||||
<p> Use shift+enter for inserting enter.</p>
|
|
||||||
<p> Refresh the page to start over fresh.</p>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
<div class="sameline">
|
<div class="sameline">
|
||||||
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
|
<textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
|
||||||
<button id="user-btn">submit</button>
|
<button id="user-btn">submit</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -14,11 +14,15 @@ own system prompts.
|
|||||||
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
||||||
enough manner, in general.
|
enough manner, in general.
|
||||||
|
|
||||||
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
|
||||||
culling of old messages from the chat.
|
console.
|
||||||
|
|
||||||
NOTE: It doesnt set any parameters other than temperature for now. However if someone wants they can update
|
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
||||||
the js file as needed.
|
culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
|
||||||
|
form of old messages culling can be achieved.
|
||||||
|
|
||||||
|
NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants
|
||||||
|
they can update the js file or equivalent member in gMe as needed.
|
||||||
|
|
||||||
|
|
||||||
## usage
|
## usage
|
||||||
@ -43,11 +47,33 @@ next run this web front end in examples/server/public_simplechat
|
|||||||
### using the front end
|
### using the front end
|
||||||
|
|
||||||
Open this simple web front end from your local browser
|
Open this simple web front end from your local browser
|
||||||
|
|
||||||
* http://127.0.0.1:PORT/index.html
|
* http://127.0.0.1:PORT/index.html
|
||||||
|
|
||||||
Once inside
|
Once inside
|
||||||
|
|
||||||
* Select between chat and completion mode. By default it is set to chat mode.
|
* Select between chat and completion mode. By default it is set to chat mode.
|
||||||
|
|
||||||
|
* In completion mode
|
||||||
|
* logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
|
||||||
|
If the model requires any prefix wrt user role messages, then the end user has to
|
||||||
|
explicitly add the needed prefix, when they enter their chat message.
|
||||||
|
Similarly if the model requires any prefix to trigger assistant/ai-model response,
|
||||||
|
then the end user needs to enter the same.
|
||||||
|
This keeps the logic simple, while still giving flexibility to the end user to
|
||||||
|
manage any templating/tagging requirement wrt their messages to the model.
|
||||||
|
* the logic doesnt insert newline at the beginning and end wrt the prompt message generated.
|
||||||
|
However if the chat being sent to /completions end point has more than one role's message,
|
||||||
|
then insert newline when moving from one role's message to the next role's message, so
|
||||||
|
that it can be clearly identified/distinguished.
|
||||||
|
* given that /completions endpoint normally doesnt add additional chat-templating of its
|
||||||
|
own, the above ensures that end user can create a custom single/multi message combo with
|
||||||
|
any tags/special-tokens related chat templating to test out model handshake. Or enduser
|
||||||
|
can use it just for normal completion related/based query.
|
||||||
|
|
||||||
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
||||||
|
Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
|
||||||
|
responses with a suitable system prompt.
|
||||||
* if chat.add_system_begin is used
|
* if chat.add_system_begin is used
|
||||||
* you cant change the system prompt, after it has been submitted once along with user query.
|
* you cant change the system prompt, after it has been submitted once along with user query.
|
||||||
* you cant set a system prompt, after you have submitted any user query
|
* you cant set a system prompt, after you have submitted any user query
|
||||||
@ -55,27 +81,121 @@ Once inside
|
|||||||
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
||||||
* inturn the updated/changed system prompt will be inserted into the chat session.
|
* inturn the updated/changed system prompt will be inserted into the chat session.
|
||||||
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
||||||
|
|
||||||
* Enter your query and either press enter or click on the submit button.
|
* Enter your query and either press enter or click on the submit button.
|
||||||
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
||||||
|
|
||||||
* Wait for the logic to communicate with the server and get the response.
|
* Wait for the logic to communicate with the server and get the response.
|
||||||
* the user is not allowed to enter any fresh query during this time.
|
* the user is not allowed to enter any fresh query during this time.
|
||||||
* the user input box will be disabled and a working message will be shown in it.
|
* the user input box will be disabled and a working message will be shown in it.
|
||||||
|
|
||||||
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
||||||
|
|
||||||
* Using NewChat one can start independent chat sessions.
|
* Using NewChat one can start independent chat sessions.
|
||||||
* two independent chat sessions are setup by default.
|
* two independent chat sessions are setup by default.
|
||||||
|
|
||||||
|
|
||||||
## Devel note
|
## Devel note
|
||||||
|
|
||||||
|
### Reason behind this
|
||||||
|
|
||||||
|
The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
|
||||||
|
by developers who may not be from web frontend background (so inturn may not be familiar with template /
|
||||||
|
end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
|
||||||
|
|
||||||
|
And given that the idea is also to help explore/experiment for developers, some flexibility is provided
|
||||||
|
to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
|
||||||
|
to explore some of the end points and ideas/implications around them.
|
||||||
|
|
||||||
|
|
||||||
|
### General
|
||||||
|
|
||||||
|
Me/gMe consolidates the settings which control the behaviour into one object.
|
||||||
|
One can see the current settings, as well as change/update them using browser's devel-tool/console.
|
||||||
|
|
||||||
|
bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
|
||||||
|
communicating with the server or only sends the latest user query/message.
|
||||||
|
|
||||||
|
bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
|
||||||
|
messages that get inserted into prompt field wrt /Completion endpoint.
|
||||||
|
|
||||||
|
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
||||||
|
irrespective of whether /chat/completions or /completions endpoint.
|
||||||
|
|
||||||
|
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||||
|
modify the existing options value or remove them, for now you can update this global var
|
||||||
|
using browser's development-tools/console.
|
||||||
|
|
||||||
|
iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
|
||||||
|
This is disabled by default. However if enabled, then in addition to latest system message, only
|
||||||
|
the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
|
||||||
|
from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
|
||||||
|
only user messages after the latest system message/prompt will be considered.
|
||||||
|
|
||||||
|
This specified sliding window user message count also includes the latest user query.
|
||||||
|
<0 : Send entire chat history to server
|
||||||
|
0 : Send only the system message if any to the server
|
||||||
|
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||||
|
|
||||||
|
|
||||||
|
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
||||||
|
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||||
|
some extent in a simple crude way.
|
||||||
|
|
||||||
|
|
||||||
Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
|
Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
|
||||||
may not be visible. Also remember that just refreshing/reloading page in browser or for that
|
may not be visible. Also remember that just refreshing/reloading page in browser or for that
|
||||||
matter clearing site data, dont directly override site caching in all cases. Worst case you may
|
matter clearing site data, dont directly override site caching in all cases. Worst case you may
|
||||||
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
|
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
|
||||||
|
|
||||||
|
|
||||||
Concept of multiple chat sessions with different servers, as well as saving and restoring of
|
Concept of multiple chat sessions with different servers, as well as saving and restoring of
|
||||||
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
|
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
|
||||||
its instances relatively easily, however given the current goal of keeping this simple, it has
|
its instances relatively easily, however given the current goal of keeping this simple, it has
|
||||||
not been added, for now.
|
not been added, for now.
|
||||||
|
|
||||||
|
|
||||||
By switching between chat.add_system_begin/anytime, one can control whether one can change
|
By switching between chat.add_system_begin/anytime, one can control whether one can change
|
||||||
the system prompt, anytime during the conversation or only at the beginning.
|
the system prompt, anytime during the conversation or only at the beginning.
|
||||||
|
|
||||||
|
|
||||||
|
read_json_early, is to experiment with reading json response data early on, if available,
|
||||||
|
so that user can be shown generated data, as and when it is being generated, rather than
|
||||||
|
at the end when full data is available.
|
||||||
|
|
||||||
|
the server flow doesnt seem to be sending back data early, at least for request (inc options)
|
||||||
|
that is currently sent.
|
||||||
|
|
||||||
|
if able to read json data early on in future, as and when ai model is generating data, then
|
||||||
|
this helper needs to indirectly update the chat div with the received data, without waiting
|
||||||
|
for the overall data to be available.
|
||||||
|
|
||||||
|
|
||||||
|
### Default setup
|
||||||
|
|
||||||
|
By default things are setup to try and make the user experience a bit better, if possible.
|
||||||
|
However a developer when testing the server of ai-model may want to change these values.
|
||||||
|
|
||||||
|
iRecentUserMsgCnt is used to reduce the chat history context sent to the server/ai-model to be
|
||||||
|
just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
|
||||||
|
full chat history. This way if there is any response with garbage/repetition, it doesnt
|
||||||
|
mess with things beyond the next question/request/query, in some ways.
|
||||||
|
|
||||||
|
Set max_tokens to 1024, so that a relatively large previous response doesnt eat up the space
|
||||||
|
available wrt next query-response. However dont forget that the server when started should
|
||||||
|
also be started with a model context size of 1k or more, to be on safe side.
|
||||||
|
|
||||||
|
The /completions endpoint of examples/server doesnt take max_tokens, instead it takes the
|
||||||
|
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||||
|
to /completions endpoint handling code on server side.
|
||||||
|
|
||||||
|
Frequency and presence penalty fields are set to 1.2 in the set of fields sent to server
|
||||||
|
along with the user query. So that the model is partly set to try avoid repeating text in
|
||||||
|
its response.
|
||||||
|
|
||||||
|
An end-user can change this behaviour by editing gMe from browser's devel-tool/console.
|
||||||
|
|
||||||
|
|
||||||
|
## At the end
|
||||||
|
|
||||||
|
Also a thank you to all open source and open model developers, who strive for the common good.
|
||||||
|
@ -48,6 +48,13 @@ button {
|
|||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Both nesting levels of the usage list share the same start-side padding. */
.ul1,
.ul2 {
    padding-inline-start: 2vw;
}
|
||||||
|
|
||||||
* {
|
* {
|
||||||
margin: 0.6vmin;
|
margin: 0.6vmin;
|
||||||
}
|
}
|
||||||
|
@ -14,23 +14,86 @@ class ApiEP {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let gUsageMsg = `
|
let gUsageMsg = `
|
||||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
<p class="role-system">Usage</p>
|
||||||
<p> Enter your text to the ai assistant below.</p>
|
<ul class="ul1">
|
||||||
<p> Use shift+enter for inserting enter.</p>
|
<li> Set system prompt above, to try control ai response charactersitic, if model supports same.</li>
|
||||||
<p> Refresh the page to start over fresh.</p>
|
<ul class="ul2">
|
||||||
|
<li> Completion mode normally wont have a system prompt.</li>
|
||||||
|
</ul>
|
||||||
|
<li> Enter your query to ai assistant below.</li>
|
||||||
|
<ul class="ul2">
|
||||||
|
<li> Completion mode doesnt insert user/role: prefix implicitly.</li>
|
||||||
|
<li> Use shift+enter for inserting enter/newline.</li>
|
||||||
|
</ul>
|
||||||
|
<li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
|
||||||
|
<ul class="ul2">
|
||||||
|
<li> experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
|
||||||
|
</ul>
|
||||||
|
</ul>
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
/** @typedef {{role: string, content: string}[]} ChatMessages */
|
||||||
|
|
||||||
class SimpleChat {
|
class SimpleChat {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
/**
|
/**
|
||||||
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||||
* @type {{role: string, content: string}[]}
|
* @type {ChatMessages}
|
||||||
*/
|
*/
|
||||||
this.xchat = [];
|
this.xchat = [];
|
||||||
this.iLastSys = -1;
|
this.iLastSys = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
clear() {
|
||||||
|
this.xchat = [];
|
||||||
|
this.iLastSys = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recent chat messages.
|
||||||
|
* If iRecentUserMsgCnt < 0
|
||||||
|
* Then return the full chat history
|
||||||
|
* Else
|
||||||
|
* Return chat messages from latest going back till the last/latest system prompt.
|
||||||
|
* While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
|
||||||
|
* @param {number} iRecentUserMsgCnt
|
||||||
|
*/
|
||||||
|
recent_chat(iRecentUserMsgCnt) {
|
||||||
|
if (iRecentUserMsgCnt < 0) {
|
||||||
|
return this.xchat;
|
||||||
|
}
|
||||||
|
if (iRecentUserMsgCnt == 0) {
|
||||||
|
console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
|
||||||
|
}
|
||||||
|
/** @type{ChatMessages} */
|
||||||
|
let rchat = [];
|
||||||
|
let sysMsg = this.get_system_latest();
|
||||||
|
if (sysMsg.length != 0) {
|
||||||
|
rchat.push({role: Roles.System, content: sysMsg});
|
||||||
|
}
|
||||||
|
let iUserCnt = 0;
|
||||||
|
let iStart = this.xchat.length;
|
||||||
|
for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
|
||||||
|
if (iUserCnt >= iRecentUserMsgCnt) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let msg = this.xchat[i];
|
||||||
|
if (msg.role == Roles.User) {
|
||||||
|
iStart = i;
|
||||||
|
iUserCnt += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(let i = iStart; i < this.xchat.length; i++) {
|
||||||
|
let msg = this.xchat[i];
|
||||||
|
if (msg.role == Roles.System) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
rchat.push({role: msg.role, content: msg.content});
|
||||||
|
}
|
||||||
|
return rchat;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add an entry into xchat
|
* Add an entry into xchat
|
||||||
* @param {string} role
|
* @param {string} role
|
||||||
@ -57,7 +120,7 @@ class SimpleChat {
|
|||||||
div.replaceChildren();
|
div.replaceChildren();
|
||||||
}
|
}
|
||||||
let last = undefined;
|
let last = undefined;
|
||||||
for(const x of this.xchat) {
|
for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||||
let entry = document.createElement("p");
|
let entry = document.createElement("p");
|
||||||
entry.className = `role-${x.role}`;
|
entry.className = `role-${x.role}`;
|
||||||
entry.innerText = `${x.role}: ${x.content}`;
|
entry.innerText = `${x.role}: ${x.content}`;
|
||||||
@ -69,17 +132,21 @@ class SimpleChat {
|
|||||||
} else {
|
} else {
|
||||||
if (bClear) {
|
if (bClear) {
|
||||||
div.innerHTML = gUsageMsg;
|
div.innerHTML = gUsageMsg;
|
||||||
|
gMe.show_info(div);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
|
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||||
|
* The needed fields/options are picked from a global object.
|
||||||
* Convert the json into string.
|
* Convert the json into string.
|
||||||
* @param {Object} obj
|
* @param {Object} obj
|
||||||
*/
|
*/
|
||||||
request_jsonstr(obj) {
|
request_jsonstr(obj) {
|
||||||
obj["temperature"] = 0.7;
|
for(let k in gMe.chatRequestOptions) {
|
||||||
|
obj[k] = gMe.chatRequestOptions[k];
|
||||||
|
}
|
||||||
return JSON.stringify(obj);
|
return JSON.stringify(obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,18 +155,27 @@ class SimpleChat {
|
|||||||
*/
|
*/
|
||||||
request_messages_jsonstr() {
|
request_messages_jsonstr() {
|
||||||
let req = {
|
let req = {
|
||||||
messages: this.xchat,
|
messages: this.recent_chat(gMe.iRecentUserMsgCnt),
|
||||||
}
|
}
|
||||||
return this.request_jsonstr(req);
|
return this.request_jsonstr(req);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return a string form of json object suitable for /completions
|
* Return a string form of json object suitable for /completions
|
||||||
|
* @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
|
||||||
*/
|
*/
|
||||||
request_prompt_jsonstr() {
|
request_prompt_jsonstr(bInsertStandardRolePrefix) {
|
||||||
let prompt = "";
|
let prompt = "";
|
||||||
for(const chat of this.xchat) {
|
let iCnt = 0;
|
||||||
prompt += `${chat.role}: ${chat.content}\n`;
|
for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||||
|
iCnt += 1;
|
||||||
|
if (iCnt > 1) {
|
||||||
|
prompt += "\n";
|
||||||
|
}
|
||||||
|
if (bInsertStandardRolePrefix) {
|
||||||
|
prompt += `${chat.role}: `;
|
||||||
|
}
|
||||||
|
prompt += `${chat.content}`;
|
||||||
}
|
}
|
||||||
let req = {
|
let req = {
|
||||||
prompt: prompt,
|
prompt: prompt,
|
||||||
@ -171,7 +247,6 @@ let gChatURL = {
|
|||||||
'chat': `${gBaseURL}/chat/completions`,
|
'chat': `${gBaseURL}/chat/completions`,
|
||||||
'completion': `${gBaseURL}/completions`,
|
'completion': `${gBaseURL}/completions`,
|
||||||
}
|
}
|
||||||
const gbCompletionFreshChatAlways = true;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -291,6 +366,8 @@ class MultiChatUI {
|
|||||||
// allow user to insert enter into their message using shift+enter.
|
// allow user to insert enter into their message using shift+enter.
|
||||||
// while just pressing enter key will lead to submitting.
|
// while just pressing enter key will lead to submitting.
|
||||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||||
|
let value = this.elInUser.value;
|
||||||
|
this.elInUser.value = value.substring(0,value.length-1);
|
||||||
this.elBtnUser.click();
|
this.elBtnUser.click();
|
||||||
ev.preventDefault();
|
ev.preventDefault();
|
||||||
}
|
}
|
||||||
@ -321,6 +398,29 @@ class MultiChatUI {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Try read json response early, if available.
|
||||||
|
* @param {Response} resp
|
||||||
|
*/
|
||||||
|
async read_json_early(resp) {
|
||||||
|
if (!resp.body) {
|
||||||
|
throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
|
||||||
|
}
|
||||||
|
let tdUtf8 = new TextDecoder("utf-8");
|
||||||
|
let rr = resp.body.getReader();
|
||||||
|
let gotBody = "";
|
||||||
|
while(true) {
|
||||||
|
let { value: cur, done: done} = await rr.read();
|
||||||
|
let curBody = tdUtf8.decode(cur);
|
||||||
|
console.debug("DBUG:SC:PART:", curBody);
|
||||||
|
gotBody += curBody;
|
||||||
|
if (done) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return JSON.parse(gotBody);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handle user query submit request, wrt specified chat session.
|
* Handle user query submit request, wrt specified chat session.
|
||||||
* @param {string} chatId
|
* @param {string} chatId
|
||||||
@ -330,6 +430,14 @@ class MultiChatUI {
|
|||||||
|
|
||||||
let chat = this.simpleChats[chatId];
|
let chat = this.simpleChats[chatId];
|
||||||
|
|
||||||
|
// In completion mode, if configured, clear any previous chat history.
|
||||||
|
// So if user wants to simulate a multi-chat based completion query,
|
||||||
|
// they will have to enter the full thing, as a suitable multiline
|
||||||
|
// user input/query.
|
||||||
|
if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
|
||||||
|
chat.clear();
|
||||||
|
}
|
||||||
|
|
||||||
chat.add_system_anytime(this.elInSystem.value, chatId);
|
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||||
|
|
||||||
let content = this.elInUser.value;
|
let content = this.elInUser.value;
|
||||||
@ -344,7 +452,7 @@ class MultiChatUI {
|
|||||||
if (apiEP == ApiEP.Chat) {
|
if (apiEP == ApiEP.Chat) {
|
||||||
theBody = chat.request_messages_jsonstr();
|
theBody = chat.request_messages_jsonstr();
|
||||||
} else {
|
} else {
|
||||||
theBody = chat.request_prompt_jsonstr();
|
theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.elInUser.value = "working...";
|
this.elInUser.value = "working...";
|
||||||
@ -359,6 +467,7 @@ class MultiChatUI {
|
|||||||
});
|
});
|
||||||
|
|
||||||
let respBody = await resp.json();
|
let respBody = await resp.json();
|
||||||
|
//let respBody = await this.read_json_early(resp);
|
||||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||||
let assistantMsg;
|
let assistantMsg;
|
||||||
if (apiEP == ApiEP.Chat) {
|
if (apiEP == ApiEP.Chat) {
|
||||||
@ -376,13 +485,6 @@ class MultiChatUI {
|
|||||||
} else {
|
} else {
|
||||||
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||||
}
|
}
|
||||||
// Purposefully clear at end rather than begin of this function
|
|
||||||
// so that one can switch from chat to completion mode and sequece
|
|
||||||
// in a completion mode with multiple user-assistant chat data
|
|
||||||
// from before to be sent/occur once.
|
|
||||||
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
|
|
||||||
chat.xchat.length = 0;
|
|
||||||
}
|
|
||||||
this.ui_reset_userinput();
|
this.ui_reset_userinput();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -462,17 +564,66 @@ class MultiChatUI {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
let gMuitChat;
|
class Me {
|
||||||
const gChatIds = [ "Default", "Other" ];
|
|
||||||
|
constructor() {
|
||||||
|
this.defaultChatIds = [ "Default", "Other" ];
|
||||||
|
this.multiChat = new MultiChatUI();
|
||||||
|
this.bCompletionFreshChatAlways = true;
|
||||||
|
this.bCompletionInsertStandardRolePrefix = false;
|
||||||
|
this.iRecentUserMsgCnt = 2;
|
||||||
|
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||||
|
this.chatRequestOptions = {
|
||||||
|
"temperature": 0.7,
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"frequency_penalty": 1.2,
|
||||||
|
"presence_penalty": 1.2,
|
||||||
|
"n_predict": 1024
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param {HTMLDivElement} elDiv
|
||||||
|
*/
|
||||||
|
show_info(elDiv) {
|
||||||
|
|
||||||
|
var p = document.createElement("p");
|
||||||
|
p.innerText = "Settings (devel-tools-console gMe)";
|
||||||
|
p.className = "role-system";
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
var p = document.createElement("p");
|
||||||
|
p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
p = document.createElement("p");
|
||||||
|
p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
p = document.createElement("p");
|
||||||
|
p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
p = document.createElement("p");
|
||||||
|
p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** @type {Me} */
|
||||||
|
let gMe;
|
||||||
|
|
||||||
function startme() {
|
function startme() {
|
||||||
console.log("INFO:SimpleChat:StartMe:Starting...");
|
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||||
gMuitChat = new MultiChatUI();
|
gMe = new Me();
|
||||||
for (let cid of gChatIds) {
|
for (let cid of gMe.defaultChatIds) {
|
||||||
gMuitChat.new_chat_session(cid);
|
gMe.multiChat.new_chat_session(cid);
|
||||||
}
|
}
|
||||||
gMuitChat.setup_ui(gChatIds[0]);
|
gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
|
||||||
gMuitChat.show_sessions();
|
gMe.multiChat.show_sessions();
|
||||||
}
|
}
|
||||||
|
|
||||||
document.addEventListener("DOMContentLoaded", startme);
|
document.addEventListener("DOMContentLoaded", startme);
|
||||||
|
@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
|||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
@ -3,40 +3,390 @@
|
|||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
#if defined(_WIN32)
|
||||||
if (argc < 3 || argv[1][0] == '-') {
|
#define WIN32_LEAN_AND_MEAN
|
||||||
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
#include <windows.h>
|
||||||
|
#include <shellapi.h> // For CommandLineToArgvW
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Writes the command line help text to `stream`.
// `argv0` is the program name shown in the "usage:" line.
static void print_usage_information(const char * argv0, FILE * stream) {
    // Only the first line needs formatting; the rest is a fixed text.
    fprintf(stream, "usage: %s [options]\n\n", argv0);

    static const char * const help_text =
        "The tokenize program tokenizes a prompt using a given model,\n"
        "and prints the resulting tokens to standard output.\n\n"
        "It needs a model file, a prompt, and optionally other flags\n"
        "to control the behavior of the tokenizer.\n\n"
        " The possible options are:\n"
        "\n"
        " -h, --help print this help and exit\n"
        " -m MODEL_PATH, --model MODEL_PATH path to model.\n"
        " --ids if given, only print numerical token IDs, and not token strings.\n"
        " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"
        " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"
        " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"
        " --stdin read prompt from standard input.\n"
        " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"
        " --log-disable disable logs. Makes stderr quiet when loading the model.\n";

    fputs(help_text, stream);
}
|
||||||
|
|
||||||
|
// Log callback that discards every message it is given.
// Installed through llama_log_set() when logging is disabled.
static void llama_log_callback_null(ggml_log_level /*level*/, const char * /*text*/, void * /*user_data*/) {
    // intentionally empty: all arguments are ignored
}
|
||||||
|
|
||||||
|
// Reads the entire content of the file at `filepath` and returns it as a string.
//
// `success` is set to true only if the file could be opened and read to the end;
// on any failure a message is printed to stderr, `success` is left false and an
// empty string is returned. An empty file is not an error.
static std::string read_prompt_from_file(const char * filepath, bool & success) {
    success = false;

    std::ifstream in(filepath, std::ios::binary);
    if (!in) {
        fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
        return std::string();
    }

    // do not assume the file is seekable (e.g. /dev/stdin): read sequentially in chunks.
    // Reading through the istream itself (rather than streaming its rdbuf() into
    // another stream) keeps the state flags of `in` meaningful, so a read error
    // can actually be detected below.
    std::string contents;
    char chunk[4096];
    while (in.read(chunk, sizeof(chunk)) || in.gcount() > 0) {
        contents.append(chunk, (size_t) in.gcount());
    }

    // hitting end-of-file sets eofbit/failbit, which is expected; badbit means a real I/O error
    if (in.bad()) {
        fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
        return std::string();
    }

    success = true;
    return contents;
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Function: ingest_args(...) -> vector<string>
|
||||||
|
//
|
||||||
|
// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
|
||||||
|
// strings, as an STL vector<string>.
|
||||||
|
//
|
||||||
|
// In particular, it handles character encoding shenanigans on Windows.
|
||||||
|
//
|
||||||
|
// Note: raw_argc and raw_argv are not actually read at all on Windows.
|
||||||
|
// On Windows we call GetCommandLineW to get the arguments in wchar_t
|
||||||
|
// format, ignoring the regular argc/argv arguments to main().
|
||||||
|
//
|
||||||
|
// TODO: potential opportunity to roll common stuff into common/console.cpp
|
||||||
|
// in relation to Windows wchar_t shenanigans.
|
||||||
|
// Returns the program arguments as UTF-8 strings (element 0 is the program name).
// On Windows the arguments are taken from GetCommandLineW() and converted from
// UTF-16; raw_argc/raw_argv are ignored there. Elsewhere raw_argv is copied as-is.
static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
    std::vector<std::string> argv;

    // Handle Windows, if given non-ASCII arguments.
    // We convert wchar_t arguments into UTF-8 char* on this platform.
    // Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
    // without throwing tantrums.
#if defined(_WIN32)
    // silence unused arg warnings
    (void) raw_argc;
    (void) raw_argv;

    int argc = 0;
    LPWSTR * wargv = CommandLineToArgvW(GetCommandLineW(), &argc);
    // CommandLineToArgvW returns NULL on failure; do not dereference it blindly.
    GGML_ASSERT(wargv);

    for (int i = 0; i < argc; ++i) {
        const int wlen = (int) wcslen(wargv[i]);

        std::string utf8;
        // WideCharToMultiByte rejects a zero-length input, an empty argument stays an empty string.
        if (wlen > 0) {
            const int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wlen, NULL, 0, NULL, NULL);
            if (length_needed > 0) {
                // std::string owns the buffer, so nothing has to be freed by hand
                utf8.resize((size_t) length_needed);
                WideCharToMultiByte(CP_UTF8, 0, wargv[i], wlen, &utf8[0], length_needed, NULL, NULL);
            }
        }

        argv.push_back(utf8);
    }

    LocalFree((HLOCAL) wargv);
#else
    int argc = raw_argc;
    for (int i = 0; i < argc; ++i) {
        argv.push_back(raw_argv[i]);
    }
#endif

    GGML_ASSERT((unsigned int) argc == argv.size());

    return argv;
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
|
||||||
|
//
|
||||||
|
// writes a string to standard output; taking into account that on Windows
|
||||||
|
// to display correctly you have to use special handling. Works even if the
|
||||||
|
// user has not set a unicode code page on a Windows cmd.exe.
|
||||||
|
//
|
||||||
|
// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
|
||||||
|
// human-readable is written instead.
|
||||||
|
//
|
||||||
|
// On non-Windows systems, simply printfs() the string.
|
||||||
|
// Prints the NUL-terminated string `str` to standard output.
// `invalid_utf8` is always reset to false first; it is set to true only in the
// Windows console path, when `str` is not valid UTF-8 (a hex dump of the bytes,
// wrapped in '<' and '>', is printed in that case).
static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
    invalid_utf8 = false;

#if defined(_WIN32)
    // Are we in a console?
    HANDLE console = GetStdHandle(STD_OUTPUT_HANDLE);
    DWORD console_mode = 0;

    // According to Microsoft docs:
    // "WriteConsole fails if it is used with a standard handle that is redirected to a file."
    // Also according to the docs, you can use GetConsoleMode to check for that.
    const bool is_console = console != INVALID_HANDLE_VALUE && GetConsoleMode(console, &console_mode);
    if (!is_console) {
        printf("%s", str);
        return;
    }

    // MultiByteToWideChar reports an error if str is empty, don't report
    // them as invalid_utf8.
    if (str[0] == '\0') {
        return;
    }

    const int n_wchars = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
    if (n_wchars == 0) {
        const DWORD last_error = GetLastError();
        if (last_error != ERROR_NO_UNICODE_TRANSLATION) {
            GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
        } else {
            invalid_utf8 = true;

            // dump the raw bytes as space separated hex, e.g. <e2 82>
            const int n_bytes = strlen(str);
            printf("<");
            for (int pos = 0; pos < n_bytes; ++pos) {
                if (pos != 0) {
                    printf(" ");
                }
                printf("%02x", (uint8_t) str[pos]);
            }
            printf(">");
            return;
        }
    }

    LPWSTR wide = (LPWSTR) calloc(n_wchars+1, sizeof(*wide));
    GGML_ASSERT(wide);

    MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wide, n_wchars);
    WriteConsoleW(console, wide, n_wchars, NULL, NULL);

    free(wide);
#else
    // TODO: reporting invalid_utf8 would be useful on non-Windows too.
    // printf will silently just write bad unicode.
    printf("%s", str);
#endif
}
|
||||||
|
|
||||||
|
int main(int raw_argc, char ** raw_argv) {
|
||||||
|
const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
|
||||||
|
const int argc = argv.size();
|
||||||
|
|
||||||
|
if (argc <= 1) {
|
||||||
|
print_usage_information(argv[0].c_str(), stderr);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * model_path = argv[1];
|
//////
|
||||||
const char * prompt = argv[2];
|
// Read out all the command line arguments.
|
||||||
|
//////
|
||||||
|
|
||||||
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
// variables where to put any arguments we see.
|
||||||
|
bool printing_ids = false;
|
||||||
|
bool no_bos = false;
|
||||||
|
bool disable_logging = false;
|
||||||
|
const char * model_path = NULL;
|
||||||
|
const char * prompt_path = NULL;
|
||||||
|
const char * prompt_arg = NULL;
|
||||||
|
|
||||||
|
// track which arguments were explicitly given
|
||||||
|
// used for sanity checking down the line
|
||||||
|
bool model_path_set = false;
|
||||||
|
bool prompt_path_set = false;
|
||||||
|
bool prompt_set = false;
|
||||||
|
bool stdin_set = false;
|
||||||
|
|
||||||
|
int iarg = 1;
|
||||||
|
for (; iarg < argc; ++iarg) {
|
||||||
|
std::string arg{argv[iarg]};
|
||||||
|
if (arg == "-h" || arg == "--help") {
|
||||||
|
print_usage_information(argv[0].c_str(), stdout);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else if (arg == "--ids") {
|
||||||
|
printing_ids = true;
|
||||||
|
}
|
||||||
|
else if (arg == "-m" || arg == "--model") {
|
||||||
|
if (model_path_set) {
|
||||||
|
fprintf(stderr, "Error: -m or --model specified multiple times.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
model_path = argv[++iarg].c_str();
|
||||||
|
model_path_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "--no-bos") {
|
||||||
|
no_bos = true;
|
||||||
|
}
|
||||||
|
else if (arg == "-p" || arg == "--prompt") {
|
||||||
|
if (prompt_set) {
|
||||||
|
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
prompt_arg = argv[++iarg].c_str();
|
||||||
|
prompt_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "-f" || arg == "--file") {
|
||||||
|
if (prompt_path_set) {
|
||||||
|
fprintf(stderr, "Error: -f or --file specified multiple times.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
prompt_path = argv[++iarg].c_str();
|
||||||
|
prompt_path_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "--stdin") {
|
||||||
|
stdin_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "--log-disable") {
|
||||||
|
disable_logging = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////
|
||||||
|
// Sanity check the command line arguments.
|
||||||
|
//////
|
||||||
|
|
||||||
|
// Check that we have the required stuff set.
|
||||||
|
if (model_path_set && model_path == NULL) {
|
||||||
|
fprintf(stderr, "Error: --model requires an argument.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!model_path_set) {
|
||||||
|
fprintf(stderr, "Error: must specify --model.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (prompt_path_set && prompt_path == NULL) {
|
||||||
|
fprintf(stderr, "Error: --file requires an argument.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (prompt_set && prompt_arg == NULL) {
|
||||||
|
fprintf(stderr, "Error: --prompt requires an argument.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
|
||||||
|
if (prompts_set > 1) {
|
||||||
|
fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
// Must have some prompt.
|
||||||
|
if (prompts_set == 0) {
|
||||||
|
fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(model_path);
|
||||||
|
GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
|
||||||
|
|
||||||
|
//////
|
||||||
|
// Figure out where will the prompt come from.
|
||||||
|
//////
|
||||||
|
|
||||||
|
std::string prompt;
|
||||||
|
if (prompt_path_set) {
|
||||||
|
bool success = false;
|
||||||
|
prompt = read_prompt_from_file(prompt_path, success);
|
||||||
|
if (!success) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (prompt_set) {
|
||||||
|
prompt = prompt_arg;
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(stdin_set);
|
||||||
|
// we read stdin *after* loading model (early exit if model cannot
|
||||||
|
// be loaded, which can be a nicer user experience)
|
||||||
|
}
|
||||||
|
|
||||||
|
//////
|
||||||
|
// Start actually doing the tokenizing stuff.
|
||||||
|
//////
|
||||||
|
|
||||||
|
#ifdef LOG_DISABLE_LOGS
|
||||||
|
disable_logging = true;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (disable_logging) {
|
||||||
|
llama_log_set(llama_log_callback_null, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
model_params.vocab_only = true;
|
model_params.vocab_only = true;
|
||||||
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
||||||
|
if (!model) {
|
||||||
|
fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
if (!ctx) {
|
||||||
|
fprintf(stderr, "Error: could not create context.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// read entire prompt from stdin?
|
||||||
|
if (stdin_set) {
|
||||||
|
GGML_ASSERT(!prompt_path_set && !prompt_set);
|
||||||
|
|
||||||
|
std::stringstream stdin_buffer;
|
||||||
|
stdin_buffer << std::cin.rdbuf();
|
||||||
|
if (std::cin.fail()) {
|
||||||
|
fprintf(stderr, "Error: could not read the entire standard input.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
prompt = stdin_buffer.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
||||||
|
const bool add_bos = model_wants_add_bos && !no_bos;
|
||||||
|
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
|
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||||
|
|
||||||
tokens = ::llama_tokenize(model, prompt, true, true);
|
if (printing_ids) {
|
||||||
|
printf("[");
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < (int) tokens.size(); i++) {
|
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||||
if (printing_ids) {
|
if (printing_ids) {
|
||||||
printf("%d\n", tokens[i]);
|
if (i > 0) {
|
||||||
|
printf(", ");
|
||||||
|
}
|
||||||
|
printf("%d", tokens[i]);
|
||||||
} else {
|
} else {
|
||||||
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
|
bool invalid_utf8 = false;
|
||||||
|
printf("%6d -> '", tokens[i]);
|
||||||
|
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
|
||||||
|
if (invalid_utf8) {
|
||||||
|
printf("' (utf-8 decode failure)\n");
|
||||||
|
} else {
|
||||||
|
printf("'\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (printing_ids) {
|
||||||
|
printf("]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// silence valgrind
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
12
flake.lock
generated
12
flake.lock
generated
@ -5,11 +5,11 @@
|
|||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1714641030,
|
"lastModified": 1715865404,
|
||||||
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
|
"narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
|
"rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@ -20,11 +20,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1714635257,
|
"lastModified": 1716509168,
|
||||||
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
|
"narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
|
"rev": "bfb7a882678e518398ce9a31a881538679f6f092",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
33
ggml-cuda.cu
33
ggml-cuda.cu
@ -119,6 +119,20 @@ int ggml_cuda_get_device() {
|
|||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allocates `size` bytes on `device` and stores the address in `*ptr`.
// The device is made current first. With GGML_USE_HIPBLAS and GGML_HIP_UMA
// defined the memory comes from hipMallocManaged and is then advised as
// coarse-grained; otherwise plain cudaMalloc is used.
// Returns the status of the allocation call.
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
    ggml_cuda_set_device(device);
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
    const auto alloc_status = hipMallocManaged(ptr, size);
    if (alloc_status != hipSuccess) {
        // allocation failed: hand the error to the caller untouched
        return alloc_status;
    }
    // if error we "need" to know why...
    CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
    return alloc_status;
#else
    return cudaMalloc(ptr, size);
#endif
}
|
||||||
|
|
||||||
static ggml_cuda_device_info ggml_cuda_init() {
|
static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
#ifdef __HIP_PLATFORM_AMD__
|
#ifdef __HIP_PLATFORM_AMD__
|
||||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||||
@ -271,7 +285,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|||||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||||
ggml_cuda_set_device(device);
|
ggml_cuda_set_device(device);
|
||||||
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||||
*actual_size = look_ahead_size;
|
*actual_size = look_ahead_size;
|
||||||
pool_size += look_ahead_size;
|
pool_size += look_ahead_size;
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
@ -537,7 +551,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
|
|||||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||||
|
|
||||||
void * dev_ptr;
|
void * dev_ptr;
|
||||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
// clear the error
|
// clear the error
|
||||||
cudaGetLastError();
|
cudaGetLastError();
|
||||||
@ -798,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
|||||||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||||
ggml_cuda_set_device(id);
|
ggml_cuda_set_device(id);
|
||||||
char * buf;
|
char * buf;
|
||||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
|
||||||
|
|
||||||
// set padding to 0 to avoid possible NaN values
|
// set padding to 0 to avoid possible NaN values
|
||||||
if (size > original_size) {
|
if (size > original_size) {
|
||||||
@ -2510,9 +2524,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
|
|
||||||
bool use_cuda_graph = true;
|
bool use_cuda_graph = true;
|
||||||
bool cuda_graph_update_required = false;
|
bool cuda_graph_update_required = false;
|
||||||
// pointer to CUDA cpy kernel, which is required to identify
|
// vector of pointers to CUDA cpy kernels, which are required to identify
|
||||||
// kernel parameters which need updated in the graph for each token
|
// kernel parameters which need updated in the graph for each token
|
||||||
void * ggml_cuda_cpy_fn_ptr = nullptr;
|
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
|
||||||
|
|
||||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||||
@ -2588,9 +2602,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
if (node->op == GGML_OP_CPY) {
|
if (node->op == GGML_OP_CPY) {
|
||||||
// store the copy op parameter which changes with each token.
|
// store the copy op parameter which changes with each token.
|
||||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||||
if (ggml_cuda_cpy_fn_ptr == nullptr) {
|
// store a pointer to each copy op CUDA kernel to identify it later
|
||||||
// store a pointer to the copy op CUDA kernel to identify it later
|
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||||
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||||
|
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2720,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
||||||
int k = 0;
|
int k = 0;
|
||||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||||
if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
|
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
|
||||||
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||||
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
||||||
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||||
|
@ -79,13 +79,8 @@
|
|||||||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||||
#define cudaHostUnregister hipHostUnregister
|
#define cudaHostUnregister hipHostUnregister
|
||||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||||
#ifdef GGML_HIP_UMA
|
|
||||||
#define cudaMalloc hipMallocManaged
|
|
||||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
|
||||||
#else
|
|
||||||
#define cudaMalloc hipMalloc
|
#define cudaMalloc hipMalloc
|
||||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||||
#endif
|
|
||||||
#define cudaMemcpy hipMemcpy
|
#define cudaMemcpy hipMemcpy
|
||||||
#define cudaMemcpyAsync hipMemcpyAsync
|
#define cudaMemcpyAsync hipMemcpyAsync
|
||||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||||
|
@ -1,15 +1,68 @@
|
|||||||
#include "concat.cuh"
|
#include "concat.cuh"
|
||||||
|
|
||||||
static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
|
// Concatenates x and y along dimension 0 (within each row) into dst.
//
// One thread handles one destination element: the flattened x-index of the
// thread is the column, blockIdx.y the row and blockIdx.z the plane. Threads
// whose column falls past the end of the row exit immediately.
//
//   x, y : inputs, indexed as densely packed rows (offsets are built from
//          element counts only, no byte strides)
//   dst  : output, rows of ne0 elements
//   ne0  : row length of dst
//   ne00 : row length of x; columns [0, ne00) are taken from x, the remaining
//          ne0 - ne00 columns are taken from y
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    if (col >= ne0) {
        return;
    }

    const int dst_idx =
        col +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;

    if (col >= ne00) { // src1: shift the column back by the width of src0
        const int src_idx =
            (col - ne00) +
            blockIdx.y * (ne0 - ne00) +
            blockIdx.z * (ne0 - ne00) * gridDim.y;
        dst[dst_idx] = y[src_idx];
        return;
    }

    // src0
    const int src_idx =
        col +
        blockIdx.y * ne00 +
        blockIdx.z * ne00 * gridDim.y;
    dst[dst_idx] = x[src_idx];
}
|
||||||
|
|
||||||
|
// Concatenates x and y along dimension 1 (stacking rows) into dst.
//
// One thread handles one destination element: the flattened x-index of the
// thread is the column, blockIdx.y the destination row and blockIdx.z the
// plane. Threads whose column falls past the end of the row exit immediately.
//
//   x, y : inputs, indexed as densely packed rows of ne0 elements
//   dst  : output; gridDim.y is used as its row count
//   ne0  : row length (shared by x, y and dst)
//   ne01 : number of rows in x; destination rows [0, ne01) are taken from x,
//          the remaining gridDim.y - ne01 rows are taken from y
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    if (col >= ne0) {
        return;
    }

    const int dst_idx =
        col +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;

    const bool from_src0 = blockIdx.y < ne01;

    // pick the input and the matching element offset inside it
    const float * src = from_src0 ? x : y;
    const int src_idx = from_src0
        ? col +  blockIdx.y         * ne0 + blockIdx.z * ne0 * ne01
        : col + (blockIdx.y - ne01) * ne0 + blockIdx.z * ne0 * (gridDim.y - ne01);

    dst[dst_idx] = src[src_idx];
}
|
||||||
|
|
||||||
|
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
|
||||||
|
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
if (nidx >= ne0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int offset_dst =
|
||||||
|
nidx +
|
||||||
|
blockIdx.y * ne0 +
|
||||||
|
blockIdx.z * ne0 * gridDim.y;
|
||||||
|
|
||||||
if (blockIdx.z < ne02) { // src0
|
if (blockIdx.z < ne02) { // src0
|
||||||
int offset_src =
|
int offset_src =
|
||||||
nidx +
|
nidx +
|
||||||
@ -25,25 +78,53 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Launches the concat kernel for one 3D slice on `stream`.
//
// The grid covers the destination: enough x-blocks of CUDA_CONCAT_BLOCK_SIZE
// threads to span a row of ne0 elements, ne1 blocks in y and ne2 blocks in z.
//
//   x, y             : device pointers to the two inputs
//   dst              : device pointer to the output
//   ne00, ne01, ne02 : extents of the first input
//   ne0,  ne1,  ne2  : extents of the output
//   dim              : concatenation axis; 0 and 1 select the dedicated
//                      kernels, every other value uses the dim-2 kernel
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
    const int blocks_per_row = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
    const dim3 grid(blocks_per_row, ne1, ne2);

    switch (dim) {
        case 0:
            concat_f32_dim0<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
            break;
        case 1:
            concat_f32_dim1<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
            break;
        default:
            concat_f32_dim2<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
            break;
    }
}
|
||||||
|
|
||||||
// CUDA implementation of the concat op: dst = concat(dst->src[0], dst->src[1])
// along the axis stored in dst->op_params[0].
//
// Both inputs must be contiguous and all three tensors must be F32.
//  - axis 0..2: one kernel launch per index of the outermost dimension, each
//    working on a 3D slice.
//  - axis 3   : the output is the first input followed by the second, so two
//    asynchronous device-to-device copies are enough.
// All work is enqueued on the context's stream.
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;

    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    const int32_t dim = ((int32_t *) dst->op_params)[0];

    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);

    if (dim == 3) {
        const size_t nbytes0 = ggml_nbytes(src0);
        const size_t nbytes1 = ggml_nbytes(src1);

        CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, nbytes0, cudaMemcpyDeviceToDevice, stream));
        // dst_d is a float pointer, so the byte size is converted to an element offset
        CUDA_CHECK(cudaMemcpyAsync(dst_d + nbytes0/sizeof(float), src1_d, nbytes1, cudaMemcpyDeviceToDevice, stream));
        return;
    }

    // distance, in float elements, between consecutive slices of the outermost dimension
    const size_t src0_step = src0->nb[3] / sizeof(float);
    const size_t src1_step = src1->nb[3] / sizeof(float);
    const size_t dst_step  =  dst->nb[3] / sizeof(float);

    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
        concat_f32_cuda(
                src0_d + i3 * src0_step,
                src1_d + i3 * src1_step,
                dst_d  + i3 * dst_step,
                src0->ne[0], src0->ne[1], src0->ne[2],
                 dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
    }
}
|
||||||
|
@ -144,6 +144,10 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
#include <arm_sve.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
// 16-bit float
|
// 16-bit float
|
||||||
// on Arm, we use __fp16
|
// on Arm, we use __fp16
|
||||||
// on x86, we use uint16_t
|
// on x86, we use uint16_t
|
||||||
|
71
ggml-metal.m
71
ggml-metal.m
@ -35,6 +35,10 @@ enum ggml_metal_kernel_type {
|
|||||||
GGML_METAL_KERNEL_TYPE_MUL_ROW,
|
GGML_METAL_KERNEL_TYPE_MUL_ROW,
|
||||||
GGML_METAL_KERNEL_TYPE_DIV,
|
GGML_METAL_KERNEL_TYPE_DIV,
|
||||||
GGML_METAL_KERNEL_TYPE_DIV_ROW,
|
GGML_METAL_KERNEL_TYPE_DIV_ROW,
|
||||||
|
GGML_METAL_KERNEL_TYPE_REPEAT_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_REPEAT_F16,
|
||||||
|
GGML_METAL_KERNEL_TYPE_REPEAT_I32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_REPEAT_I16,
|
||||||
GGML_METAL_KERNEL_TYPE_SCALE,
|
GGML_METAL_KERNEL_TYPE_SCALE,
|
||||||
GGML_METAL_KERNEL_TYPE_SCALE_4,
|
GGML_METAL_KERNEL_TYPE_SCALE_4,
|
||||||
GGML_METAL_KERNEL_TYPE_CLAMP,
|
GGML_METAL_KERNEL_TYPE_CLAMP,
|
||||||
@ -184,9 +188,9 @@ enum ggml_metal_kernel_type {
|
|||||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
|
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
|
||||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
|
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
|
||||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
|
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
|
||||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
|
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
|
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
|
||||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
|
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
|
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
|
||||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
|
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
|
||||||
@ -485,6 +489,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I16, repeat_i16, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
|
||||||
@ -634,9 +642,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
|
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
|
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
|
||||||
@ -746,6 +754,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
|
case GGML_OP_REPEAT:
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
case GGML_OP_CLAMP:
|
case GGML_OP_CLAMP:
|
||||||
case GGML_OP_SQR:
|
case GGML_OP_SQR:
|
||||||
@ -770,6 +779,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
return true;
|
return true;
|
||||||
case GGML_OP_FLASH_ATTN_EXT:
|
case GGML_OP_FLASH_ATTN_EXT:
|
||||||
|
if (op->src[0]->ne[0] == 256) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
|
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
case GGML_OP_MUL_MAT_ID:
|
case GGML_OP_MUL_MAT_ID:
|
||||||
@ -976,10 +988,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
switch (dst->op) {
|
switch (dst->op) {
|
||||||
case GGML_OP_CONCAT:
|
case GGML_OP_CONCAT:
|
||||||
{
|
{
|
||||||
const int64_t nb = ne00;
|
|
||||||
|
|
||||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
|
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
|
||||||
|
|
||||||
|
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||||
|
|
||||||
[encoder setComputePipelineState:pipeline];
|
[encoder setComputePipelineState:pipeline];
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||||
@ -1008,7 +1020,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
||||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
||||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
||||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
[encoder setBytes:&dim length:sizeof(dim) atIndex:27];
|
||||||
|
|
||||||
const int nth = MIN(1024, ne0);
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
@ -1018,11 +1030,14 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
{
|
{
|
||||||
|
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||||
|
|
||||||
const size_t offs = 0;
|
const size_t offs = 0;
|
||||||
|
|
||||||
bool bcast_row = false;
|
bool bcast_row = false;
|
||||||
|
|
||||||
int64_t nb = ne00;
|
int64_t nb = ne00; // used by the "row" kernels
|
||||||
|
|
||||||
id<MTLComputePipelineState> pipeline = nil;
|
id<MTLComputePipelineState> pipeline = nil;
|
||||||
|
|
||||||
@ -1091,6 +1106,42 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_REPEAT:
|
||||||
|
{
|
||||||
|
id<MTLComputePipelineState> pipeline;
|
||||||
|
|
||||||
|
switch (src0t) {
|
||||||
|
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break;
|
||||||
|
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
|
||||||
|
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
|
||||||
|
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
|
||||||
|
default: GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
[encoder setComputePipelineState:pipeline];
|
||||||
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||||
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||||
|
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||||
|
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||||
|
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||||
|
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||||
|
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||||
|
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||||
|
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||||
|
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||||
|
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||||
|
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||||
|
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||||
|
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||||
|
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||||
|
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||||
|
|
||||||
|
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||||
|
|
||||||
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
} break;
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||||
@ -2573,7 +2624,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
|
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
|
||||||
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
|
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
|
||||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
|
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
|
||||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||||
@ -2586,7 +2637,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
|
|
||||||
switch (ne00) {
|
switch (ne00) {
|
||||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
|
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
|
||||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||||
|
@ -168,6 +168,53 @@ kernel void kernel_div(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tiles src0 into dst, one element of type T at a time.
//
// Each threadgroup handles one destination row: tgpig.x/y/z give the row's
// index in dimensions 1/2/3. The threads of the group step through the ne0
// elements of that row with a stride of ntg.x. Source indices are the
// destination indices taken modulo the source extents, which is what makes the
// source repeat.
//
//   ne00..ne03 / nb00..nb03 : extents / byte strides of src0
//   ne0..ne3   / nb0..nb3   : extents / byte strides of dst
template<typename T>
kernel void kernel_repeat(
        device const char * src0,
        device       char * dst,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant  int64_t & ne03,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb03,
        constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant  int64_t & ne2,
        constant  int64_t & ne3,
        constant uint64_t & nb0,
        constant uint64_t & nb1,
        constant uint64_t & nb2,
        constant uint64_t & nb3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    // destination row coordinates
    const int64_t d1 = tgpig.x;
    const int64_t d2 = tgpig.y;
    const int64_t d3 = tgpig.z;

    // start of the matching (wrapped) source row and of the destination row
    device const char * src_row = src0 + (d3 % ne03)*nb03 + (d2 % ne02)*nb02 + (d1 % ne01)*nb01;
    device       char * dst_row = dst  +  d3*nb3          +  d2*nb2          +  d1*nb1;

    for (int d0 = tpitg.x; d0 < ne0; d0 += ntg.x) {
        const int s0 = d0 % ne00; // wrapped source column

        device const T * from = (device const T *)(src_row + s0*nb00);
        device       T * to   = (device       T *)(dst_row + d0*nb0);

        *to = *from;
    }
}

// all instantiations share one signature; only the element type differs
typedef decltype(kernel_repeat<float>) kernel_repeat_t;

template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
|
||||||
|
|
||||||
// assumption: src1 is a row
|
// assumption: src1 is a row
|
||||||
// broadcast src1 into src0
|
// broadcast src1 into src0
|
||||||
kernel void kernel_add_row(
|
kernel void kernel_add_row(
|
||||||
@ -2418,7 +2465,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f
|
|||||||
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
|
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
|
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
|
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
|
//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
|
||||||
|
|
||||||
template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
|
template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
|
||||||
kernel void kernel_flash_attn_ext_vec_f16(
|
kernel void kernel_flash_attn_ext_vec_f16(
|
||||||
@ -2696,7 +2743,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||||||
}
|
}
|
||||||
|
|
||||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
|
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
|
//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
|
||||||
|
|
||||||
kernel void kernel_cpy_f16_f16(
|
kernel void kernel_cpy_f16_f16(
|
||||||
device const half * src0,
|
device const half * src0,
|
||||||
@ -3319,31 +3366,30 @@ kernel void kernel_concat(
|
|||||||
constant uint64_t & nb1,
|
constant uint64_t & nb1,
|
||||||
constant uint64_t & nb2,
|
constant uint64_t & nb2,
|
||||||
constant uint64_t & nb3,
|
constant uint64_t & nb3,
|
||||||
|
constant int32_t & dim,
|
||||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
uint3 ntg[[threads_per_threadgroup]]) {
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
|
||||||
const int64_t i03 = tgpig.z;
|
const int64_t i3 = tgpig.z;
|
||||||
const int64_t i02 = tgpig.y;
|
const int64_t i2 = tgpig.y;
|
||||||
const int64_t i01 = tgpig.x;
|
const int64_t i1 = tgpig.x;
|
||||||
|
|
||||||
const int64_t i13 = i03 % ne13;
|
int64_t o[4] = {0, 0, 0, 0};
|
||||||
const int64_t i12 = i02 % ne12;
|
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||||
const int64_t i11 = i01 % ne11;
|
|
||||||
|
|
||||||
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
|
device const float * x;
|
||||||
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
|
|
||||||
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
|
|
||||||
|
|
||||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||||
if (i02 < ne02) {
|
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
|
x = (device const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||||
src0_ptr += ntg.x*nb00;
|
|
||||||
} else {
|
} else {
|
||||||
((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
|
x = (device const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
|
||||||
src1_ptr += ntg.x*nb10;
|
|
||||||
}
|
}
|
||||||
dst_ptr += ntg.x*nb0;
|
|
||||||
|
device float * y = (device float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
*y = *x;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3813,7 +3813,44 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||||
|
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||||
|
|
||||||
|
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||||
|
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||||
|
|
||||||
|
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i += 2) {
|
||||||
|
const block_q4_0 * restrict x0 = &x[i + 0];
|
||||||
|
const block_q4_0 * restrict x1 = &x[i + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||||
|
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
||||||
|
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||||
|
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
@ -5384,7 +5421,32 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||||
|
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||||
|
|
||||||
|
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i += 2) {
|
||||||
|
const block_q8_0 * restrict x0 = &x[i + 0];
|
||||||
|
const block_q8_0 * restrict x1 = &x[i + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
|
||||||
|
const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
|
||||||
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
@ -12144,7 +12206,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|||||||
printf("\n");
|
printf("\n");
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
q2[2*ib+0] |= (grid_index << 8*k);
|
q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
|
||||||
q2[2*ib+1] |= (block_signs[k] << 7*k);
|
q2[2*ib+1] |= (block_signs[k] << 7*k);
|
||||||
}
|
}
|
||||||
GGML_ASSERT(scale >= 0);
|
GGML_ASSERT(scale >= 0);
|
||||||
|
127
ggml-rpc.cpp
127
ggml-rpc.cpp
@ -6,6 +6,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <mutex>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@ -47,6 +48,7 @@ struct socket_t {
|
|||||||
sockfd_t fd;
|
sockfd_t fd;
|
||||||
socket_t(sockfd_t fd) : fd(fd) {}
|
socket_t(sockfd_t fd) : fd(fd) {}
|
||||||
~socket_t() {
|
~socket_t() {
|
||||||
|
GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
closesocket(this->fd);
|
closesocket(this->fd);
|
||||||
#else
|
#else
|
||||||
@ -97,7 +99,7 @@ static ggml_guid_t ggml_backend_rpc_guid() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_rpc_buffer_type_context {
|
struct ggml_backend_rpc_buffer_type_context {
|
||||||
std::shared_ptr<socket_t> sock;
|
std::string endpoint;
|
||||||
std::string name;
|
std::string name;
|
||||||
size_t alignment;
|
size_t alignment;
|
||||||
size_t max_size;
|
size_t max_size;
|
||||||
@ -106,8 +108,6 @@ struct ggml_backend_rpc_buffer_type_context {
|
|||||||
// Per-instance state of an RPC backend.
// The backend no longer holds a socket or a buffer type of its own: code that needs a
// connection looks one up from `endpoint` on demand.
struct ggml_backend_rpc_context {
    // server address in the "host:port" form understood by parse_endpoint()
    std::string endpoint;
    // backend name string
    std::string name;
};
|
||||||
|
|
||||||
struct ggml_backend_rpc_buffer_context {
|
struct ggml_backend_rpc_buffer_context {
|
||||||
@ -231,14 +231,13 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Splits an endpoint of the form "host:port" at its first ':' into `host` and `port`.
//
// Returns true and fills both outputs on success. Returns false, leaving both outputs
// untouched, when there is no ':' separator, when the port text is not something
// std::stoi can convert (empty, non-numeric, or overflowing), or when the converted
// value lies outside the TCP port range 0..65535.
static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
    const size_t pos = endpoint.find(':');
    if (pos == std::string::npos) {
        return false;
    }

    int parsed_port = 0;
    try {
        // std::stoi throws std::invalid_argument / std::out_of_range on bad input;
        // this helper reports failure through its return value, so translate the
        // exception instead of letting it escape to callers that only check the bool.
        parsed_port = std::stoi(endpoint.substr(pos + 1));
    } catch (const std::exception &) {
        return false;
    }

    // a value outside 16 bits can never be a valid port and would otherwise be
    // silently truncated when converted for the socket API
    if (parsed_port < 0 || parsed_port > 65535) {
        return false;
    }

    host = endpoint.substr(0, pos);
    port = parsed_port;
    return true;
}
|
||||||
|
|
||||||
@ -273,6 +272,44 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
|||||||
|
|
||||||
// RPC client-side implementation
|
// RPC client-side implementation
|
||||||
|
|
||||||
|
static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||||
|
static std::mutex mutex;
|
||||||
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
|
static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
|
||||||
|
static bool initialized = false;
|
||||||
|
|
||||||
|
auto it = sockets.find(endpoint);
|
||||||
|
if (it != sockets.end()) {
|
||||||
|
if (auto sock = it->second.lock()) {
|
||||||
|
return sock;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::string host;
|
||||||
|
int port;
|
||||||
|
if (!parse_endpoint(endpoint, host, port)) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
#ifdef _WIN32
|
||||||
|
if (!initialized) {
|
||||||
|
WSADATA wsaData;
|
||||||
|
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
||||||
|
if (res != 0) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
initialized = true;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
UNUSED(initialized);
|
||||||
|
#endif
|
||||||
|
auto sock = socket_connect(host.c_str(), port);
|
||||||
|
if (sock == nullptr) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||||
|
sockets[endpoint] = sock;
|
||||||
|
return sock;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||||
return ctx->name.c_str();
|
return ctx->name.c_str();
|
||||||
@ -442,7 +479,8 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|||||||
std::vector<uint8_t> input(input_size, 0);
|
std::vector<uint8_t> input(input_size, 0);
|
||||||
memcpy(input.data(), &size, sizeof(size));
|
memcpy(input.data(), &size, sizeof(size));
|
||||||
std::vector<uint8_t> output;
|
std::vector<uint8_t> output;
|
||||||
bool status = send_rpc_cmd(buft_ctx->sock, ALLOC_BUFFER, input, output);
|
auto sock = get_socket(buft_ctx->endpoint);
|
||||||
|
bool status = send_rpc_cmd(sock, ALLOC_BUFFER, input, output);
|
||||||
GGML_ASSERT(status);
|
GGML_ASSERT(status);
|
||||||
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
|
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
|
||||||
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
||||||
@ -453,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|||||||
if (remote_ptr != 0) {
|
if (remote_ptr != 0) {
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||||
ggml_backend_rpc_buffer_interface,
|
ggml_backend_rpc_buffer_interface,
|
||||||
new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"},
|
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
||||||
remote_size);
|
remote_size);
|
||||||
return buffer;
|
return buffer;
|
||||||
} else {
|
} else {
|
||||||
@ -508,7 +546,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend
|
|||||||
}
|
}
|
||||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||||
return buft_ctx->sock == rpc_ctx->sock;
|
return buft_ctx->endpoint == rpc_ctx->endpoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
||||||
@ -521,7 +559,6 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
|||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
||||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||||
|
|
||||||
@ -530,16 +567,13 @@ GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
|||||||
|
|
||||||
// Destroys an RPC backend: first the ggml_backend_rpc_context stored in backend->context,
// then the backend object itself. The buffer type is not touched here.
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
    delete (ggml_backend_rpc_context *) backend->context;
    delete backend;
}
|
||||||
|
|
||||||
// Returns the buffer type for this backend by passing the endpoint stored in its
// context to ggml_backend_rpc_buffer_type().
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
    const ggml_backend_rpc_context * rpc_ctx = (const ggml_backend_rpc_context *) backend->context;
    const char * endpoint = rpc_ctx->endpoint.c_str();
    return ggml_backend_rpc_buffer_type(endpoint);
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||||
@ -590,7 +624,8 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
|
|||||||
std::vector<uint8_t> input;
|
std::vector<uint8_t> input;
|
||||||
serialize_graph(cgraph, input);
|
serialize_graph(cgraph, input);
|
||||||
std::vector<uint8_t> output;
|
std::vector<uint8_t> output;
|
||||||
bool status = send_rpc_cmd(rpc_ctx->sock, GRAPH_COMPUTE, input, output);
|
auto sock = get_socket(rpc_ctx->endpoint);
|
||||||
|
bool status = send_rpc_cmd(sock, GRAPH_COMPUTE, input, output);
|
||||||
GGML_ASSERT(status);
|
GGML_ASSERT(status);
|
||||||
GGML_ASSERT(output.size() == 1);
|
GGML_ASSERT(output.size() == 1);
|
||||||
return (enum ggml_status)output[0];
|
return (enum ggml_status)output[0];
|
||||||
@ -624,42 +659,24 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|||||||
/* .event_synchronize = */ NULL,
|
/* .event_synchronize = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::unordered_map<std::string, ggml_backend_t> instances;
|
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
static std::mutex mutex;
|
||||||
return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr;
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
}
|
// NOTE: buffer types are allocated and never freed; this is by design
|
||||||
|
static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
|
||||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
auto it = buft_map.find(endpoint);
|
||||||
std::string endpoint_str(endpoint);
|
if (it != buft_map.end()) {
|
||||||
if (instances.find(endpoint_str) != instances.end()) {
|
return it->second;
|
||||||
return instances[endpoint_str];
|
|
||||||
}
|
}
|
||||||
#ifdef _WIN32
|
auto sock = get_socket(endpoint);
|
||||||
{
|
|
||||||
WSADATA wsaData;
|
|
||||||
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
|
||||||
if (res != 0) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
fprintf(stderr, "Connecting to %s\n", endpoint);
|
|
||||||
std::string host;
|
|
||||||
int port;
|
|
||||||
if (!parse_endpoint(endpoint, host, port)) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
auto sock = socket_connect(host.c_str(), port);
|
|
||||||
if (sock == nullptr) {
|
if (sock == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
size_t alignment = get_alignment(sock);
|
size_t alignment = get_alignment(sock);
|
||||||
size_t max_size = get_max_size(sock);
|
size_t max_size = get_max_size(sock);
|
||||||
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
||||||
/* .sock = */ sock,
|
/* .endpoint = */ endpoint,
|
||||||
/* .name = */ "RPC" + std::to_string(sock->fd),
|
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||||
/* .alignment = */ alignment,
|
/* .alignment = */ alignment,
|
||||||
/* .max_size = */ max_size
|
/* .max_size = */ max_size
|
||||||
};
|
};
|
||||||
@ -668,21 +685,22 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
|||||||
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
||||||
/* .context = */ buft_ctx
|
/* .context = */ buft_ctx
|
||||||
};
|
};
|
||||||
|
buft_map[endpoint] = buft;
|
||||||
|
return buft;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||||
/* .endpoint = */ endpoint,
|
/* .endpoint = */ endpoint,
|
||||||
/* .name = */ "RPC" + std::to_string(sock->fd),
|
/* .name = */ "RPC",
|
||||||
/* .sock = */ sock,
|
|
||||||
/* .buft = */ buft
|
|
||||||
};
|
};
|
||||||
|
|
||||||
instances[endpoint] = new ggml_backend {
|
ggml_backend_t backend = new ggml_backend {
|
||||||
/* .guid = */ ggml_backend_rpc_guid(),
|
/* .guid = */ ggml_backend_rpc_guid(),
|
||||||
/* .interface = */ ggml_backend_rpc_interface,
|
/* .interface = */ ggml_backend_rpc_interface,
|
||||||
/* .context = */ ctx
|
/* .context = */ ctx
|
||||||
};
|
};
|
||||||
|
return backend;
|
||||||
return instances[endpoint];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||||
@ -706,14 +724,13 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Queries the free and total device memory of the RPC server at `endpoint`.
// When no socket can be obtained for the endpoint, both outputs are set to 0.
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
    const std::shared_ptr<socket_t> sock = get_socket(endpoint);
    if (sock != nullptr) {
        get_device_memory(sock, free, total);
        return;
    }
    // no connection available: report an empty device
    *free  = 0;
    *total = 0;
}
|
||||||
|
|
||||||
// RPC server-side implementation
|
// RPC server-side implementation
|
||||||
|
350
ggml-sycl.cpp
350
ggml-sycl.cpp
@ -2944,6 +2944,57 @@ namespace dpct
|
|||||||
using shared_memory = detail::device_memory<T, shared, Dimension>;
|
using shared_memory = detail::device_memory<T, shared, Dimension>;
|
||||||
|
|
||||||
|
|
||||||
|
template <typename T,
|
||||||
|
sycl::access::address_space addressSpace =
|
||||||
|
sycl::access::address_space::global_space,
|
||||||
|
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||||
|
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||||
|
inline T atomic_fetch_add(T *addr, T operand) {
|
||||||
|
auto atm =
|
||||||
|
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||||
|
return atm.fetch_add(operand);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <sycl::access::address_space addressSpace =
|
||||||
|
sycl::access::address_space::global_space,
|
||||||
|
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||||
|
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||||
|
typename T1, typename T2>
|
||||||
|
inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
|
||||||
|
auto atm =
|
||||||
|
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||||
|
return atm.fetch_add(operand);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, sycl::access::address_space addressSpace =
|
||||||
|
sycl::access::address_space::global_space>
|
||||||
|
inline T atomic_fetch_add(T *addr, T operand,
|
||||||
|
sycl::memory_order memoryOrder) {
|
||||||
|
switch (memoryOrder) {
|
||||||
|
case sycl::memory_order::relaxed:
|
||||||
|
return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
|
||||||
|
sycl::memory_scope::device>(addr, operand);
|
||||||
|
case sycl::memory_order::acq_rel:
|
||||||
|
return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
|
||||||
|
sycl::memory_scope::device>(addr, operand);
|
||||||
|
case sycl::memory_order::seq_cst:
|
||||||
|
return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
|
||||||
|
sycl::memory_scope::device>(addr, operand);
|
||||||
|
default:
|
||||||
|
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||||
|
"atomics are: sycl::memory_order::relaxed, "
|
||||||
|
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <sycl::access::address_space addressSpace =
|
||||||
|
sycl::access::address_space::global_space,
|
||||||
|
typename T1, typename T2>
|
||||||
|
inline T1 atomic_fetch_add(T1 *addr, T2 operand,
|
||||||
|
sycl::memory_order memoryOrder) {
|
||||||
|
atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
|
||||||
|
}
|
||||||
|
|
||||||
} // COPY from DPCT head files
|
} // COPY from DPCT head files
|
||||||
|
|
||||||
#define GGML_COMMON_DECL_SYCL
|
#define GGML_COMMON_DECL_SYCL
|
||||||
@ -3060,6 +3111,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
|
|||||||
bool ggml_backend_is_sycl(ggml_backend_t backend);
|
bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||||
int ggml_backend_sycl_get_device(ggml_backend_t backend);
|
int ggml_backend_sycl_get_device(ggml_backend_t backend);
|
||||||
int get_main_device();
|
int get_main_device();
|
||||||
|
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
|
||||||
void print_ggml_tensor(const char*name, struct ggml_tensor *src);
|
void print_ggml_tensor(const char*name, struct ggml_tensor *src);
|
||||||
void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);
|
void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);
|
||||||
|
|
||||||
@ -8830,12 +8882,11 @@ static void rope(
|
|||||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, bool has_pos>
|
template<typename T, bool has_pos, bool has_freq_facs>
|
||||||
static void rope_neox(
|
static void rope_neox(
|
||||||
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
|
||||||
,
|
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
|
||||||
const sycl::nd_item<3> &item_ct1) {
|
|
||||||
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||||
item_ct1.get_local_id(1));
|
item_ct1.get_local_id(1));
|
||||||
|
|
||||||
@ -8863,8 +8914,10 @@ static void rope_neox(
|
|||||||
float cur_rot = inv_ndims * ic - ib;
|
float cur_rot = inv_ndims * ic - ib;
|
||||||
|
|
||||||
const int p = has_pos ? pos[i2] : 0;
|
const int p = has_pos ? pos[i2] : 0;
|
||||||
|
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
|
||||||
|
|
||||||
const float theta_base =
|
const float theta_base =
|
||||||
p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
|
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
|
||||||
|
|
||||||
float cos_theta, sin_theta;
|
float cos_theta, sin_theta;
|
||||||
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||||
@ -12413,7 +12466,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|||||||
const int32_t *pos, float freq_scale,
|
const int32_t *pos, float freq_scale,
|
||||||
int p_delta_rows, float freq_base, float ext_factor,
|
int p_delta_rows, float freq_base, float ext_factor,
|
||||||
float attn_factor, rope_corr_dims corr_dims,
|
float attn_factor, rope_corr_dims corr_dims,
|
||||||
dpct::queue_ptr stream) {
|
const float * freq_factors, dpct::queue_ptr stream) {
|
||||||
GGML_ASSERT(ncols % 2 == 0);
|
GGML_ASSERT(ncols % 2 == 0);
|
||||||
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
||||||
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
||||||
@ -12423,39 +12476,49 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|||||||
const float inv_ndims = -1.0f / n_dims;
|
const float inv_ndims = -1.0f / n_dims;
|
||||||
|
|
||||||
if (pos == nullptr) {
|
if (pos == nullptr) {
|
||||||
/*
|
|
||||||
DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
|
|
||||||
the limit. To get the device limit, query
|
|
||||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
||||||
*/
|
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
dpct::has_capability_or_fail(stream->get_device(),
|
||||||
{sycl::aspect::fp16});
|
{sycl::aspect::fp16});
|
||||||
|
if (freq_factors == nullptr) {
|
||||||
stream->parallel_for(
|
stream->parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1) {
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||||
p_delta_rows, ext_factor, attn_factor,
|
p_delta_rows, ext_factor, attn_factor,
|
||||||
corr_dims, theta_scale, inv_ndims,
|
corr_dims, theta_scale, inv_ndims, freq_factors,
|
||||||
item_ct1);
|
item_ct1);
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
/*
|
|
||||||
DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
|
|
||||||
the limit. To get the device limit, query
|
|
||||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
||||||
*/
|
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
|
||||||
{sycl::aspect::fp16});
|
|
||||||
|
|
||||||
stream->parallel_for(
|
stream->parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1) {
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||||
p_delta_rows, ext_factor, attn_factor,
|
p_delta_rows, ext_factor, attn_factor,
|
||||||
corr_dims, theta_scale, inv_ndims, item_ct1);
|
corr_dims, theta_scale, inv_ndims, freq_factors,
|
||||||
|
item_ct1);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
dpct::has_capability_or_fail(stream->get_device(),
|
||||||
|
{sycl::aspect::fp16});
|
||||||
|
|
||||||
|
if (freq_factors == nullptr) {
|
||||||
|
stream->parallel_for(
|
||||||
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||||
|
p_delta_rows, ext_factor, attn_factor,
|
||||||
|
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
stream->parallel_for(
|
||||||
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
||||||
|
p_delta_rows, ext_factor, attn_factor,
|
||||||
|
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
|
static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
|
||||||
@ -13501,6 +13564,10 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
|
|||||||
const float *src0_dd, const float *src1_dd,
|
const float *src0_dd, const float *src1_dd,
|
||||||
float *dst_dd,
|
float *dst_dd,
|
||||||
const dpct::queue_ptr &main_stream) {
|
const dpct::queue_ptr &main_stream) {
|
||||||
|
#pragma message("TODO: generalize concat kernel for dim != 2")
|
||||||
|
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
|
||||||
|
int dim = dst->op_params[0];
|
||||||
|
// the concat kernel is not generalized yet (see the TODO above): only dim == 2 is supported
GGML_ASSERT(dim == 2);
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||||
@ -13986,9 +14053,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|||||||
ggml_tensor *dst, const float *src0_dd,
|
ggml_tensor *dst, const float *src0_dd,
|
||||||
const float *src1_dd, float *dst_dd,
|
const float *src1_dd, float *dst_dd,
|
||||||
const dpct::queue_ptr &main_stream) {
|
const dpct::queue_ptr &main_stream) {
|
||||||
#pragma message("TODO: implement phi3 frequency factors support")
|
const ggml_tensor * src2 = dst->src[2];
|
||||||
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
|
||||||
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||||
@ -14014,6 +14079,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|||||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||||
|
|
||||||
|
const float * freq_factors = nullptr;
|
||||||
const int32_t * pos = nullptr;
|
const int32_t * pos = nullptr;
|
||||||
if ((mode & 1) == 0) {
|
if ((mode & 1) == 0) {
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||||
@ -14024,6 +14090,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|||||||
const bool is_neox = mode & 2;
|
const bool is_neox = mode & 2;
|
||||||
const bool is_glm = mode & 4;
|
const bool is_glm = mode & 4;
|
||||||
|
|
||||||
|
if (is_neox) {
|
||||||
|
pos = (const int32_t *) src1_dd;
|
||||||
|
|
||||||
|
if (src2 != nullptr) {
|
||||||
|
freq_factors = (const float *) src2->data;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
|
||||||
|
}
|
||||||
|
|
||||||
rope_corr_dims corr_dims;
|
rope_corr_dims corr_dims;
|
||||||
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
||||||
|
|
||||||
@ -14035,13 +14111,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|||||||
if (src0->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
rope_neox_sycl(
|
rope_neox_sycl(
|
||||||
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||||
attn_factor, corr_dims, main_stream
|
attn_factor, corr_dims, freq_factors, main_stream
|
||||||
);
|
);
|
||||||
} else if (src0->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F16) {
|
||||||
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
|
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
|
||||||
ne00, n_dims, nrows, pos, freq_scale, ne01,
|
ne00, n_dims, nrows, pos, freq_scale, ne01,
|
||||||
freq_base, ext_factor, attn_factor, corr_dims,
|
freq_base, ext_factor, attn_factor, corr_dims,
|
||||||
main_stream);
|
freq_factors, main_stream);
|
||||||
} else {
|
} else {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
@ -15243,6 +15319,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
|
bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
|
||||||
|
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
|
||||||
|
|
||||||
if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
|
if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
|
||||||
use_mul_mat_q = false;
|
use_mul_mat_q = false;
|
||||||
@ -15434,22 +15511,86 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Origin of one gathered row. k_copy_src1_to_contiguous stores {group(1) index,
// group(2) index} here for each slot it fills; k_copy_dst_from_contiguous later uses
// i1 and i2 with the nb1 / nb2 byte strides to address the destination row.
struct mmid_row_mapping {
    int32_t i1; // first stored index, scaled by nb1 on write-back
    int32_t i2; // second stored index, scaled by nb2 on write-back
};
|
||||||
|
|
||||||
|
// Gathers one src1 row into the contiguous staging buffer when its id equals i02.
//
// The group(2) and group(1) indices of the work-group select a 32-bit id inside `ids`
// (byte strides ids_nb1 and ids_nb0). Work-items whose id differs from i02 return
// immediately. Otherwise the work-item with local id 0 atomically increments
// *cur_src1_row to claim a slot, stores that slot in `src1_row` and records
// {group(1), group(2)} in row_mapping[slot]. After a barrier, the work-items copy the
// ne10 floats of the source row into row `src1_row` of src1_contiguous, each taking a
// strided share.
//
// NOTE(review): src1_row is written by one work-item and read by all of them after the
// barrier, so it presumably refers to storage shared by the work-group - confirm at the
// call site.
__dpct_inline__ static void k_copy_src1_to_contiguous(
        const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
        int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
        const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
        int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
        const sycl::nd_item<3> &item_ct1, int &src1_row) {
    const int32_t grp2 = item_ct1.get_group(2);
    const int32_t grp1 = item_ct1.get_group(1);

    // id selected by this work-group
    const char * id_ptr = ids + grp2*ids_nb1 + grp1*ids_nb0;
    const int32_t selected = *(const int32_t *) id_ptr;
    if (selected != i02) {
        return;
    }

    const int64_t src_i11 = grp1 % ne11;
    const int64_t src_i12 = grp2;

    const int lane = item_ct1.get_local_id(2);
    if (lane == 0) {
        // one work-item claims the next free slot and records where the row came from
        src1_row = dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(cur_src1_row, 1);
        row_mapping[src1_row] = {grp1, grp2};
    }
    // every work-item must see the claimed slot before copying
    // (DPCT1065: a local-space fence may be sufficient if no global memory access is involved)
    item_ct1.barrier();

    const float * from = (const float *)(src1_original + src_i11*nb11 + src_i12*nb12);
    float * to = (float *)(src1_contiguous + src1_row*nb11);

    const int step = item_ct1.get_local_range(2);
#pragma unroll
    for (int col = lane; col < ne10; col += step) {
        to[col] = from[col];
    }
}
|
||||||
|
|
||||||
|
// Scatters one row of the contiguous result buffer back to its place in dst.
//
// The group(2) index selects both the contiguous row (byte offset index*nb1) and the
// row_mapping entry whose i1 / i2 fields give the destination offset i1*nb1 + i2*nb2.
// The work-items then copy the ne0 floats of the row, each taking a strided share.
__dpct_inline__ static void k_copy_dst_from_contiguous(
        char *__restrict__ dst_original, const char *__restrict__ dst_contiguous,
        const mmid_row_mapping *__restrict__ row_mapping, int64_t ne0, size_t nb1,
        size_t nb2, const sycl::nd_item<3> &item_ct1) {
    const int32_t row = item_ct1.get_group(2);

    const int32_t dst_i1 = row_mapping[row].i1;
    const int32_t dst_i2 = row_mapping[row].i2;

    const float * from = (const float *)(dst_contiguous + row*nb1);
    float * to = (float *)(dst_original + dst_i1*nb1 + dst_i2*nb2);

    const int lane = item_ct1.get_local_id(2);
    const int step = item_ct1.get_local_range(2);
#pragma unroll
    for (int col = lane; col < ne0; col += step) {
        to[col] = from[col];
    }
}
|
||||||
|
|
||||||
static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||||
const ggml_tensor *src1,
|
const ggml_tensor *src1,
|
||||||
ggml_tensor *dst) try {
|
ggml_tensor *dst) try {
|
||||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
|
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
||||||
"mul_mat_id does not support split buffers");
|
|
||||||
const ggml_tensor *ids = dst->src[2];
|
const ggml_tensor *ids = dst->src[2];
|
||||||
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
|
const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
|
||||||
|
|
||||||
const size_t nb11 = src1->nb[1];
|
const int64_t n_as = ne02;
|
||||||
const size_t nb1 = dst->nb[1];
|
const int64_t n_ids = ids->ne[0];
|
||||||
|
|
||||||
const int32_t id = ((int32_t *)dst->op_params)[0];
|
|
||||||
const int32_t n_as = src0->ne[2];
|
|
||||||
|
|
||||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||||
const char *ids_dev = (const char *)ids->data;
|
const char * ids_dev = (const char *) ids->data;
|
||||||
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||||
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
||||||
@ -15489,25 +15630,41 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
|||||||
|
|
||||||
src0_row.ne[2] = 1;
|
src0_row.ne[2] = 1;
|
||||||
src0_row.ne[3] = 1;
|
src0_row.ne[3] = 1;
|
||||||
src0_row.nb[3] = src0->nb[2];
|
src0_row.nb[3] = nb02;
|
||||||
|
|
||||||
if (src1->ne[1] == 1) {
|
src1_row.ne[1] = 1;
|
||||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
src1_row.ne[2] = 1;
|
||||||
const int32_t row_id =
|
src1_row.ne[3] = 1;
|
||||||
*(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
|
src1_row.nb[2] = nb11;
|
||||||
id * ids->nb[0]);
|
src1_row.nb[3] = nb11;
|
||||||
|
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
dst_row.ne[1] = 1;
|
||||||
|
dst_row.ne[2] = 1;
|
||||||
|
dst_row.ne[3] = 1;
|
||||||
|
dst_row.nb[2] = nb1;
|
||||||
|
dst_row.nb[3] = nb1;
|
||||||
|
if (ne12 == 1) {
|
||||||
|
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||||
|
for (int64_t id = 0; id < n_ids; id++) {
|
||||||
|
const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||||
|
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||||
|
|
||||||
|
const int64_t i11 = id % ne11;
|
||||||
|
const int64_t i12 = iid1;
|
||||||
|
|
||||||
|
const int64_t i1 = id;
|
||||||
|
const int64_t i2 = i12;
|
||||||
|
|
||||||
src0_row_extra.data_device[g_main_device] =
|
src0_row_extra.data_device[g_main_device] =
|
||||||
src0_original + row_id * src0->nb[2];
|
src0_original + i02*nb02;
|
||||||
src1_row_extra.data_device[g_main_device] =
|
src1_row_extra.data_device[g_main_device] =
|
||||||
src1_original + i01 * src1->nb[1];
|
src1_original + + i11*nb11 + i12*nb12;
|
||||||
dst_row_extra.data_device[g_main_device] =
|
dst_row_extra.data_device[g_main_device] =
|
||||||
dst_original + i01 * dst->nb[1];
|
dst_original + i1*nb1 + i2*nb2;
|
||||||
|
|
||||||
ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
|
ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
|
sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
|
||||||
sycl_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));
|
sycl_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));
|
||||||
@ -15515,64 +15672,98 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
|||||||
src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
|
src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
|
||||||
dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
|
dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
|
||||||
|
|
||||||
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
||||||
int64_t num_src1_rows = 0;
|
int64_t num_src1_rows = 0;
|
||||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||||
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
for (int64_t id = 0; id < n_ids; id++) {
|
||||||
|
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||||
|
|
||||||
if (row_id_i != row_id) {
|
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
||||||
|
|
||||||
|
if (row_id_i != i02) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
|
||||||
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
||||||
stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
|
|
||||||
src1_original + i01 * nb11, nb11)));
|
|
||||||
num_src1_rows++;
|
num_src1_rows++;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (num_src1_rows == 0) {
|
if (num_src1_rows == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
src0_row_extra.data_device[g_main_device] =
|
|
||||||
src0_original + row_id * src0->nb[2];
|
|
||||||
|
|
||||||
|
sycl_pool_alloc<int> dev_cur_src1_row(1);
|
||||||
|
sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(num_src1_rows);
|
||||||
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||||
|
stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
|
||||||
|
|
||||||
|
{
|
||||||
|
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
|
||||||
|
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
|
||||||
|
stream->submit([&](sycl::handler &cgh) {
|
||||||
|
sycl::local_accessor<int, 0> src1_row_acc(cgh);
|
||||||
|
|
||||||
|
char *__restrict src1_contiguous_get =
|
||||||
|
src1_contiguous.get();
|
||||||
|
int *__restrict dev_cur_src1_row_get =
|
||||||
|
dev_cur_src1_row.get();
|
||||||
|
mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||||
|
dev_row_mapping.get();
|
||||||
|
size_t ids_nb_ct6 = ids->nb[1];
|
||||||
|
size_t ids_nb_ct7 = ids->nb[0];
|
||||||
|
|
||||||
|
cgh.parallel_for(
|
||||||
|
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
k_copy_src1_to_contiguous(
|
||||||
|
src1_original, src1_contiguous_get,
|
||||||
|
dev_cur_src1_row_get,
|
||||||
|
dev_row_mapping_get, ids_dev, i02,
|
||||||
|
ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
|
||||||
|
item_ct1, src1_row_acc);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
src0_row_extra.data_device[g_main_device] = src0_original + i02*nb02;
|
||||||
|
|
||||||
|
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
||||||
|
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
||||||
src1_row.ne[1] = num_src1_rows;
|
src1_row.ne[1] = num_src1_rows;
|
||||||
dst_row.ne[1] = num_src1_rows;
|
|
||||||
|
|
||||||
src1_row.nb[1] = nb11;
|
src1_row.nb[1] = nb11;
|
||||||
src1_row.nb[2] = num_src1_rows*nb11;
|
src1_row.nb[2] = num_src1_rows*nb11;
|
||||||
src1_row.nb[3] = num_src1_rows*nb11;
|
src1_row.nb[3] = num_src1_rows*nb11;
|
||||||
|
|
||||||
|
dst_row.ne[1] = num_src1_rows;
|
||||||
dst_row.nb[1] = nb1;
|
dst_row.nb[1] = nb1;
|
||||||
dst_row.nb[2] = num_src1_rows*nb1;
|
dst_row.nb[2] = num_src1_rows*nb1;
|
||||||
dst_row.nb[3] = num_src1_rows*nb1;
|
dst_row.nb[3] = num_src1_rows*nb1;
|
||||||
|
|
||||||
ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
|
ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
|
||||||
|
|
||||||
num_src1_rows = 0;
|
{
|
||||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
|
||||||
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
sycl::range<3> grid_dims(1, 1, num_src1_rows);
|
||||||
|
stream->submit([&](sycl::handler &cgh) {
|
||||||
|
const char *__restrict dst_contiguous_get =
|
||||||
|
dst_contiguous.get();
|
||||||
|
const mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||||
|
dev_row_mapping.get();
|
||||||
|
|
||||||
if (row_id_i != row_id) {
|
cgh.parallel_for(
|
||||||
continue;
|
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||||
}
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
k_copy_dst_from_contiguous(dst_original,
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
dst_contiguous_get,
|
||||||
|
dev_row_mapping_get,
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
|
ne0, nb1, nb2, item_ct1);
|
||||||
dst_original + i01 * nb1,
|
});
|
||||||
dst_contiguous.get() + num_src1_rows * nb1, nb1)));
|
});
|
||||||
num_src1_rows++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||||
@ -16555,10 +16746,9 @@ GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backe
|
|||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
// unused at the moment
|
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
||||||
//static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
|
||||||
// return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
|
}
|
||||||
//}
|
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||||
|
@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
|
||||||
|
ggml_vk_instance_init();
|
||||||
|
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
|
std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
65
ggml.c
65
ggml.c
@ -4882,10 +4882,21 @@ struct ggml_tensor * ggml_repeat_back(
|
|||||||
// ggml_concat
|
// ggml_concat
|
||||||
|
|
||||||
struct ggml_tensor * ggml_concat(
|
struct ggml_tensor * ggml_concat(
|
||||||
struct ggml_context* ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor* a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor* b) {
|
struct ggml_tensor * b,
|
||||||
GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
|
int dim) {
|
||||||
|
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
|
||||||
|
|
||||||
|
int64_t ne[GGML_MAX_DIMS];
|
||||||
|
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
|
||||||
|
if (d == dim) {
|
||||||
|
ne[d] = a->ne[d] + b->ne[d];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
GGML_ASSERT(a->ne[d] == b->ne[d]);
|
||||||
|
ne[d] = a->ne[d];
|
||||||
|
}
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
@ -4893,7 +4904,9 @@ struct ggml_tensor * ggml_concat(
|
|||||||
is_node = true;
|
is_node = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
|
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
|
||||||
|
|
||||||
|
ggml_set_op_params_i32(result, 0, dim);
|
||||||
|
|
||||||
result->op = GGML_OP_CONCAT;
|
result->op = GGML_OP_CONCAT;
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
@ -5013,6 +5026,7 @@ struct ggml_tensor * ggml_leaky_relu(
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
|
|
||||||
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
|
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
|
||||||
|
|
||||||
result->op = GGML_OP_LEAKY_RELU;
|
result->op = GGML_OP_LEAKY_RELU;
|
||||||
@ -10967,34 +10981,37 @@ static void ggml_compute_forward_concat_f32(
|
|||||||
GGML_ASSERT(nb00 == sizeof(float));
|
GGML_ASSERT(nb00 == sizeof(float));
|
||||||
GGML_ASSERT(nb10 == sizeof(float));
|
GGML_ASSERT(nb10 == sizeof(float));
|
||||||
|
|
||||||
|
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||||
|
|
||||||
|
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||||
|
|
||||||
|
int64_t o[4] = {0, 0, 0, 0};
|
||||||
|
o[dim] = src0->ne[dim];
|
||||||
|
|
||||||
|
const float * x;
|
||||||
|
|
||||||
|
// TODO: smarter multi-threading
|
||||||
for (int i3 = 0; i3 < ne3; i3++) {
|
for (int i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
||||||
if (i2 < ne02) { // src0
|
|
||||||
for (int i1 = 0; i1 < ne1; i1++) {
|
for (int i1 = 0; i1 < ne1; i1++) {
|
||||||
for (int i0 = 0; i0 < ne0; i0++) {
|
for (int i0 = 0; i0 < ne0; i0++) {
|
||||||
const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
|
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
||||||
|
} else {
|
||||||
|
x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
||||||
|
}
|
||||||
|
|
||||||
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
|
float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||||
*y = *x;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // src1
|
|
||||||
else {
|
|
||||||
for (int i1 = 0; i1 < ne1; i1++) {
|
|
||||||
for (int i0 = 0; i0 < ne0; i0++) {
|
|
||||||
const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
|
|
||||||
|
|
||||||
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
|
|
||||||
*y = *x;
|
*y = *x;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_compute_forward_concat(
|
static void ggml_compute_forward_concat(
|
||||||
const struct ggml_compute_params* params,
|
const struct ggml_compute_params * params,
|
||||||
struct ggml_tensor* dst) {
|
struct ggml_tensor* dst) {
|
||||||
|
|
||||||
const struct ggml_tensor * src0 = dst->src[0];
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
@ -22742,6 +22759,16 @@ int ggml_cpu_has_neon(void) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether this build targets Arm SVE.
// Returns 1 when compiled with __ARM_FEATURE_SVE defined, 0 otherwise.
int ggml_cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE)
    // TODO: only 256-bit SVE is currently supported, so the vector length
    //       reported by svcntb() must equal QK8_0; anything else trips the assert.
    GGML_ASSERT(svcntb() == QK8_0);
    return 1;
#else
    return 0;
#endif
}
|
||||||
|
|
||||||
int ggml_cpu_has_arm_fma(void) {
|
int ggml_cpu_has_arm_fma(void) {
|
||||||
#if defined(__ARM_FEATURE_FMA)
|
#if defined(__ARM_FEATURE_FMA)
|
||||||
return 1;
|
return 1;
|
||||||
|
6
ggml.h
6
ggml.h
@ -1007,12 +1007,13 @@ extern "C" {
|
|||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
// concat a and b on dim 2
|
// concat a and b along dim
|
||||||
// used in stable-diffusion
|
// used in stable-diffusion
|
||||||
GGML_API struct ggml_tensor * ggml_concat(
|
GGML_API struct ggml_tensor * ggml_concat(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b,
|
||||||
|
int dim);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_abs(
|
GGML_API struct ggml_tensor * ggml_abs(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@ -2404,6 +2405,7 @@ extern "C" {
|
|||||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||||
GGML_API int ggml_cpu_has_fma (void);
|
GGML_API int ggml_cpu_has_fma (void);
|
||||||
GGML_API int ggml_cpu_has_neon (void);
|
GGML_API int ggml_cpu_has_neon (void);
|
||||||
|
GGML_API int ggml_cpu_has_sve (void);
|
||||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||||
GGML_API int ggml_cpu_has_metal (void);
|
GGML_API int ggml_cpu_has_metal (void);
|
||||||
GGML_API int ggml_cpu_has_f16c (void);
|
GGML_API int ggml_cpu_has_f16c (void);
|
||||||
|
@ -37,11 +37,15 @@ class Keys:
|
|||||||
CONTEXT_LENGTH = "{arch}.context_length"
|
CONTEXT_LENGTH = "{arch}.context_length"
|
||||||
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||||
BLOCK_COUNT = "{arch}.block_count"
|
BLOCK_COUNT = "{arch}.block_count"
|
||||||
|
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
|
||||||
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||||
|
EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
|
||||||
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||||
EXPERT_COUNT = "{arch}.expert_count"
|
EXPERT_COUNT = "{arch}.expert_count"
|
||||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||||
|
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
|
||||||
|
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
||||||
POOLING_TYPE = "{arch}.pooling_type"
|
POOLING_TYPE = "{arch}.pooling_type"
|
||||||
LOGIT_SCALE = "{arch}.logit_scale"
|
LOGIT_SCALE = "{arch}.logit_scale"
|
||||||
|
|
||||||
@ -55,6 +59,8 @@ class Keys:
|
|||||||
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
||||||
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||||
CAUSAL = "{arch}.attention.causal"
|
CAUSAL = "{arch}.attention.causal"
|
||||||
|
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
|
||||||
|
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
|
||||||
|
|
||||||
class Rope:
|
class Rope:
|
||||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||||
@ -64,6 +70,7 @@ class Keys:
|
|||||||
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
||||||
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
||||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||||
|
SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
|
||||||
|
|
||||||
class SSM:
|
class SSM:
|
||||||
CONV_KERNEL = "{arch}.ssm.conv_kernel"
|
CONV_KERNEL = "{arch}.ssm.conv_kernel"
|
||||||
@ -139,6 +146,8 @@ class MODEL_ARCH(IntEnum):
|
|||||||
COMMAND_R = auto()
|
COMMAND_R = auto()
|
||||||
DBRX = auto()
|
DBRX = auto()
|
||||||
OLMO = auto()
|
OLMO = auto()
|
||||||
|
ARCTIC = auto()
|
||||||
|
DEEPSEEK2 = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
@ -167,6 +176,7 @@ class MODEL_TENSOR(IntEnum):
|
|||||||
FFN_DOWN = auto()
|
FFN_DOWN = auto()
|
||||||
FFN_UP = auto()
|
FFN_UP = auto()
|
||||||
FFN_ACT = auto()
|
FFN_ACT = auto()
|
||||||
|
FFN_NORM_EXP = auto()
|
||||||
FFN_GATE_EXP = auto()
|
FFN_GATE_EXP = auto()
|
||||||
FFN_DOWN_EXP = auto()
|
FFN_DOWN_EXP = auto()
|
||||||
FFN_UP_EXP = auto()
|
FFN_UP_EXP = auto()
|
||||||
@ -183,6 +193,12 @@ class MODEL_TENSOR(IntEnum):
|
|||||||
SSM_A = auto()
|
SSM_A = auto()
|
||||||
SSM_D = auto()
|
SSM_D = auto()
|
||||||
SSM_OUT = auto()
|
SSM_OUT = auto()
|
||||||
|
ATTN_Q_A = auto()
|
||||||
|
ATTN_Q_B = auto()
|
||||||
|
ATTN_KV_A_MQA = auto()
|
||||||
|
ATTN_KV_B = auto()
|
||||||
|
ATTN_Q_A_NORM = auto()
|
||||||
|
ATTN_KV_A_NORM = auto()
|
||||||
|
|
||||||
|
|
||||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
@ -218,6 +234,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||||||
MODEL_ARCH.COMMAND_R: "command-r",
|
MODEL_ARCH.COMMAND_R: "command-r",
|
||||||
MODEL_ARCH.DBRX: "dbrx",
|
MODEL_ARCH.DBRX: "dbrx",
|
||||||
MODEL_ARCH.OLMO: "olmo",
|
MODEL_ARCH.OLMO: "olmo",
|
||||||
|
MODEL_ARCH.ARCTIC: "arctic",
|
||||||
|
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
||||||
}
|
}
|
||||||
|
|
||||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
@ -251,6 +269,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||||
|
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||||
@ -262,6 +281,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||||
|
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||||
|
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
||||||
|
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
|
||||||
|
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
|
||||||
}
|
}
|
||||||
|
|
||||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
@ -733,6 +758,54 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.ARCTIC: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_NORM_EXP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
],
|
||||||
|
MODEL_ARCH.DEEPSEEK2: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_Q_A,
|
||||||
|
MODEL_TENSOR.ATTN_Q_B,
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_MQA,
|
||||||
|
MODEL_TENSOR.ATTN_KV_B,
|
||||||
|
MODEL_TENSOR.ATTN_Q_A_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||||
|
],
|
||||||
# TODO
|
# TODO
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -766,6 +839,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||||||
MODEL_TENSOR.ROPE_FREQS,
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.DEEPSEEK2: [
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
|
|
||||||
|
from .quants import quant_shape_to_byte_shape
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -251,6 +253,7 @@ class GGUFReader:
|
|||||||
tensor_names.add(tensor_name)
|
tensor_names.add(tensor_name)
|
||||||
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
||||||
n_elems = int(np.prod(dims))
|
n_elems = int(np.prod(dims))
|
||||||
|
np_dims = tuple(reversed(dims.tolist()))
|
||||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
||||||
n_bytes = n_elems * type_size // block_size
|
n_bytes = n_elems * type_size // block_size
|
||||||
data_offs = int(start_offs + offset_tensor[0])
|
data_offs = int(start_offs + offset_tensor[0])
|
||||||
@ -279,6 +282,7 @@ class GGUFReader:
|
|||||||
else:
|
else:
|
||||||
item_count = n_bytes
|
item_count = n_bytes
|
||||||
item_type = np.uint8
|
item_type = np.uint8
|
||||||
|
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
|
||||||
tensors.append(ReaderTensor(
|
tensors.append(ReaderTensor(
|
||||||
name = tensor_name,
|
name = tensor_name,
|
||||||
tensor_type = ggml_type,
|
tensor_type = ggml_type,
|
||||||
@ -286,7 +290,7 @@ class GGUFReader:
|
|||||||
n_elements = n_elems,
|
n_elements = n_elems,
|
||||||
n_bytes = n_bytes,
|
n_bytes = n_bytes,
|
||||||
data_offset = data_offs,
|
data_offset = data_offs,
|
||||||
data = self._get(data_offs, item_type, item_count),
|
data = self._get(data_offs, item_type, item_count).reshape(np_dims),
|
||||||
field = field,
|
field = field,
|
||||||
))
|
))
|
||||||
self.tensors = tensors
|
self.tensors = tensors
|
||||||
|
@ -13,7 +13,6 @@ from string import ascii_letters, digits
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .constants import (
|
from .constants import (
|
||||||
GGML_QUANT_SIZES,
|
|
||||||
GGUF_DEFAULT_ALIGNMENT,
|
GGUF_DEFAULT_ALIGNMENT,
|
||||||
GGUF_MAGIC,
|
GGUF_MAGIC,
|
||||||
GGUF_VERSION,
|
GGUF_VERSION,
|
||||||
@ -26,6 +25,8 @@ from .constants import (
|
|||||||
TokenType,
|
TokenType,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .quants import quant_shape_from_byte_shape
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@ -229,10 +230,7 @@ class GGUFWriter:
|
|||||||
else:
|
else:
|
||||||
dtype = raw_dtype
|
dtype = raw_dtype
|
||||||
if tensor_dtype == np.uint8:
|
if tensor_dtype == np.uint8:
|
||||||
block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
|
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
||||||
if tensor_shape[-1] % type_size != 0:
|
|
||||||
raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
|
|
||||||
tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
|
|
||||||
n_dims = len(tensor_shape)
|
n_dims = len(tensor_shape)
|
||||||
self.ti_data += self._pack("I", n_dims)
|
self.ti_data += self._pack("I", n_dims)
|
||||||
for i in range(n_dims):
|
for i in range(n_dims):
|
||||||
@ -376,9 +374,15 @@ class GGUFWriter:
|
|||||||
def add_block_count(self, length: int) -> None:
|
def add_block_count(self, length: int) -> None:
|
||||||
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
|
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
|
||||||
|
|
||||||
|
def add_leading_dense_block_count(self, length: int) -> None:
    """Store `length` as a uint32 under the LEADING_DENSE_BLOCK_COUNT key for this writer's arch."""
    self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)
|
||||||
|
|
||||||
def add_feed_forward_length(self, length: int) -> None:
|
def add_feed_forward_length(self, length: int) -> None:
|
||||||
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||||
|
|
||||||
|
def add_expert_feed_forward_length(self, length: int) -> None:
    """Store `length` as a uint32 under the EXPERT_FEED_FORWARD_LENGTH key for this writer's arch."""
    self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||||
|
|
||||||
def add_parallel_residual(self, use: bool) -> None:
|
def add_parallel_residual(self, use: bool) -> None:
|
||||||
self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||||
|
|
||||||
@ -409,6 +413,12 @@ class GGUFWriter:
|
|||||||
def add_expert_used_count(self, count: int) -> None:
|
def add_expert_used_count(self, count: int) -> None:
|
||||||
self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
|
self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
|
||||||
|
|
||||||
|
def add_expert_shared_count(self, count: int) -> None:
    """Store `count` as a uint32 under the EXPERT_SHARED_COUNT key for this writer's arch."""
    self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)
|
||||||
|
|
||||||
|
def add_expert_weights_scale(self, value: float) -> None:
    """Store `value` as a float32 under the EXPERT_WEIGHTS_SCALE key for this writer's arch."""
    self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_layer_norm_eps(self, value: float) -> None:
|
def add_layer_norm_eps(self, value: float) -> None:
|
||||||
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
||||||
|
|
||||||
@ -418,6 +428,12 @@ class GGUFWriter:
|
|||||||
def add_causal_attention(self, value: bool) -> None:
|
def add_causal_attention(self, value: bool) -> None:
|
||||||
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_q_lora_rank(self, length: int) -> None:
    """Store `length` as a uint32 under the attention Q_LORA_RANK key for this writer's arch."""
    self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)
|
||||||
|
|
||||||
|
def add_kv_lora_rank(self, length: int) -> None:
    """Store `length` as a uint32 under the attention KV_LORA_RANK key for this writer's arch."""
    self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
|
||||||
|
|
||||||
def add_pooling_type(self, value: PoolingType) -> None:
|
def add_pooling_type(self, value: PoolingType) -> None:
|
||||||
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
|
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
|
||||||
|
|
||||||
@ -442,6 +458,9 @@ class GGUFWriter:
|
|||||||
def add_rope_scaling_finetuned(self, value: bool) -> None:
    """Write ``value`` as a bool under the ``Keys.Rope.SCALING_FINETUNED`` key formatted with ``self.arch``."""
    key = Keys.Rope.SCALING_FINETUNED.format(arch=self.arch)
    self.add_bool(key, value)
|
||||||
|
|
||||||
|
def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
    """Write ``value`` as a float32 under the ``Keys.Rope.SCALING_YARN_LOG_MUL`` key formatted with ``self.arch``."""
    key = Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch)
    self.add_float32(key, value)
|
||||||
|
|
||||||
def add_ssm_conv_kernel(self, value: int) -> None:
    """Write ``value`` as a uint32 under the ``Keys.SSM.CONV_KERNEL`` key formatted with ``self.arch``."""
    key = Keys.SSM.CONV_KERNEL.format(arch=self.arch)
    self.add_uint32(key, value)
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Callable
|
from typing import Callable, Sequence
|
||||||
|
|
||||||
from numpy.typing import DTypeLike
|
from numpy.typing import DTypeLike
|
||||||
|
|
||||||
@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
    """Convert a tensor shape counted in elements into the shape counted in bytes.

    Only the last dimension changes: it is divided by the block size and
    multiplied by the type size that ``GGML_QUANT_SIZES`` lists for
    ``quant_type``. All leading dimensions are passed through unchanged.

    Raises:
        ValueError: if ``shape`` has no dimensions, or if its last dimension
            is not a multiple of the block size.
    """
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    # Without this guard an empty shape fails on shape[-1] with a bare IndexError.
    if len(shape) == 0:
        raise ValueError(f"Quantized tensor shape must have at least one dimension for {quant_type.name}")
    row_size = shape[-1]
    if row_size % block_size != 0:
        raise ValueError(f"Quantized tensor row size ({row_size}) is not a multiple of {quant_type.name} block size ({block_size})")
    return (*shape[:-1], row_size // block_size * type_size)
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
    """Convert a tensor shape counted in bytes back into the shape counted in elements.

    Only the last dimension changes: it is divided by the type size and
    multiplied by the block size that ``GGML_QUANT_SIZES`` lists for
    ``quant_type``. All leading dimensions are passed through unchanged.

    Raises:
        ValueError: if ``shape`` has no dimensions, or if its last dimension
            is not a multiple of the type size.
    """
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    # Without this guard an empty shape fails on shape[-1] with a bare IndexError.
    if len(shape) == 0:
        raise ValueError(f"Quantized tensor shape must have at least one dimension for {quant_type.name}")
    row_bytes = shape[-1]
    if row_bytes % type_size != 0:
        raise ValueError(f"Quantized tensor bytes per row ({row_bytes}) is not a multiple of {quant_type.name} type size ({type_size})")
    return (*shape[:-1], row_bytes // type_size * block_size)
|
||||||
|
|
||||||
|
|
||||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||||
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
||||||
n = n.astype(np.float32, copy=False).view(np.int32)
|
n = n.astype(np.float32, copy=False).view(np.int32)
|
||||||
|
@ -244,6 +244,7 @@ class TensorNameMap:
|
|||||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||||
|
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
@ -255,6 +256,7 @@ class TensorNameMap:
|
|||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||||
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
|
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
|
||||||
|
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
|
||||||
),
|
),
|
||||||
|
|
||||||
# AWQ-activation gate
|
# AWQ-activation gate
|
||||||
@ -272,6 +274,7 @@ class TensorNameMap:
|
|||||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
||||||
"transformer.h.{bid}.mlp.linear_1", # refact
|
"transformer.h.{bid}.mlp.linear_1", # refact
|
||||||
|
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
@ -283,6 +286,7 @@ class TensorNameMap:
|
|||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||||
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
|
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
|
||||||
|
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward down
|
# Feed-forward down
|
||||||
@ -306,6 +310,7 @@ class TensorNameMap:
|
|||||||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||||
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||||
|
"model.layers.{bid}.residual_mlp.w2", # arctic
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
@ -317,6 +322,7 @@ class TensorNameMap:
|
|||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||||
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
|
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
|
||||||
|
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||||
@ -380,6 +386,42 @@ class TensorNameMap:
|
|||||||
"model.layers.{bid}.out_proj",
|
"model.layers.{bid}.out_proj",
|
||||||
"backbone.layers.{bid}.mixer.out_proj",
|
"backbone.layers.{bid}.mixer.out_proj",
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_Q_A: (
|
||||||
|
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_Q_B: (
|
||||||
|
"model.layers.{bid}.self_attn.q_b_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_MQA: (
|
||||||
|
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_KV_B: (
|
||||||
|
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_Q_A_NORM: (
|
||||||
|
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_NORM: (
|
||||||
|
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
# architecture-specific block mappings
|
||||||
|
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
|
||||||
|
MODEL_ARCH.ARCTIC: {
|
||||||
|
MODEL_TENSOR.FFN_NORM: (
|
||||||
|
"model.layers.{bid}.residual_layernorm",
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.FFN_NORM_EXP: (
|
||||||
|
"model.layers.{bid}.post_attention_layernorm",
|
||||||
|
),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||||
@ -393,12 +435,14 @@ class TensorNameMap:
|
|||||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||||
for key in keys:
|
for key in keys:
|
||||||
self.mapping[key] = (tensor, tensor_name)
|
self.mapping[key] = (tensor, tensor_name)
|
||||||
|
if arch in self.arch_block_mappings_cfg:
|
||||||
|
self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
|
||||||
for bid in range(n_blocks):
|
for bid in range(n_blocks):
|
||||||
for tensor, keys in self.block_mappings_cfg.items():
|
for tensor, keys in self.block_mappings_cfg.items():
|
||||||
if tensor not in MODEL_TENSORS[arch]:
|
if tensor not in MODEL_TENSORS[arch]:
|
||||||
continue
|
continue
|
||||||
# TODO: make this configurable
|
# TODO: make this configurable
|
||||||
n_experts = 60
|
n_experts = 160
|
||||||
for xid in range(n_experts):
|
for xid in range(n_experts):
|
||||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
||||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||||
|
@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
|
|||||||
|
|
||||||
for tensor in reader.tensors:
|
for tensor in reader.tensors:
|
||||||
total_bytes += tensor.n_bytes
|
total_bytes += tensor.n_bytes
|
||||||
# Dimensions are written in reverse order, so flip them first
|
writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
||||||
shape = np.flipud(tensor.shape).tolist()
|
|
||||||
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
|
||||||
|
|
||||||
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
||||||
|
|
||||||
|
18
llama.h
18
llama.h
@ -85,6 +85,7 @@ extern "C" {
|
|||||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
// note: these values should be synchronized with ggml_rope
|
||||||
@ -264,6 +265,8 @@ extern "C" {
|
|||||||
bool check_tensors; // validate model tensor data
|
bool check_tensors; // validate model tensor data
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||||
|
// https://github.com/ggerganov/llama.cpp/pull/7544
|
||||||
struct llama_context_params {
|
struct llama_context_params {
|
||||||
uint32_t seed; // RNG seed, -1 for random
|
uint32_t seed; // RNG seed, -1 for random
|
||||||
uint32_t n_ctx; // text context, 0 = from model
|
uint32_t n_ctx; // text context, 0 = from model
|
||||||
@ -290,14 +293,14 @@ extern "C" {
|
|||||||
ggml_backend_sched_eval_callback cb_eval;
|
ggml_backend_sched_eval_callback cb_eval;
|
||||||
void * cb_eval_user_data;
|
void * cb_eval_user_data;
|
||||||
|
|
||||||
enum ggml_type type_k; // data type for K cache
|
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
|
||||||
enum ggml_type type_v; // data type for V cache
|
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||||
bool embeddings; // if true, extract embeddings (together with logits)
|
bool embeddings; // if true, extract embeddings (together with logits)
|
||||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
bool flash_attn; // whether to use flash attention
|
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||||
|
|
||||||
// Abort callback
|
// Abort callback
|
||||||
// if it returns true, execution of llama_decode() will be aborted
|
// if it returns true, execution of llama_decode() will be aborted
|
||||||
@ -759,6 +762,12 @@ extern "C" {
|
|||||||
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
||||||
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
||||||
|
|
||||||
|
// Get the number of threads used for generation of a single token.
|
||||||
|
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Get the number of threads used for prompt and batch processing (multiple tokens).
|
||||||
|
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||||
|
|
||||||
// Set whether to use causal attention or not
|
// Set whether to use causal attention or not
|
||||||
// If set to true, the model will only attend to the past tokens
|
// If set to true, the model will only attend to the past tokens
|
||||||
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
||||||
@ -817,6 +826,9 @@ extern "C" {
|
|||||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||||
|
|
||||||
|
// Identify if Token Id is a control token or a render-able token
|
||||||
|
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
|
||||||
|
|
||||||
// Special tokens
|
// Special tokens
|
||||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||||
|
@ -1259,22 +1259,26 @@ struct test_im2col : public test_case {
|
|||||||
// GGML_OP_CONCAT
|
// GGML_OP_CONCAT
|
||||||
struct test_concat : public test_case {
|
struct test_concat : public test_case {
|
||||||
const ggml_type type;
|
const ggml_type type;
|
||||||
const std::array<int64_t, 4> ne;
|
const std::array<int64_t, 4> ne_a;
|
||||||
const int64_t b_ne2;
|
const int64_t ne_b_d;
|
||||||
|
const int dim;
|
||||||
|
|
||||||
std::string vars() override {
|
std::string vars() override {
|
||||||
return VARS_TO_STR3(type, ne, b_ne2);
|
return VARS_TO_STR4(type, ne_a, ne_b_d, dim);
|
||||||
}
|
}
|
||||||
|
|
||||||
test_concat(ggml_type type = GGML_TYPE_F32,
|
test_concat(ggml_type type = GGML_TYPE_F32,
|
||||||
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
|
||||||
int64_t b_ne2 = 10)
|
int64_t ne_b_d = 10,
|
||||||
: type(type), ne(ne), b_ne2(b_ne2) {}
|
int dim = 2)
|
||||||
|
: type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}
|
||||||
|
|
||||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
auto ne_b = ne_a;
|
||||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
|
ne_b[dim] = ne_b_d;
|
||||||
ggml_tensor * out = ggml_concat(ctx, a, b);
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||||
|
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
||||||
|
ggml_tensor * out = ggml_concat(ctx, a, b, dim);
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -2211,8 +2215,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
|
for (int dim : { 0, 1, 2, 3, }) {
|
||||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
|
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim));
|
||||||
|
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim));
|
||||||
|
}
|
||||||
|
|
||||||
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
||||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
||||||
|
@ -49,8 +49,14 @@ int main(void) {
|
|||||||
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
|
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
|
||||||
// Llama-3
|
// Llama-3
|
||||||
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
|
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
|
||||||
// Phi-3
|
//Phi-3-mini
|
||||||
"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
|
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
|
||||||
|
//Phi-3-small
|
||||||
|
"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
|
||||||
|
//Phi-3-medium
|
||||||
|
"{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
|
||||||
|
//Phi-3-vision
|
||||||
|
"{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
|
||||||
};
|
};
|
||||||
std::vector<std::string> expected_output = {
|
std::vector<std::string> expected_output = {
|
||||||
// teknium/OpenHermes-2.5-Mistral-7B
|
// teknium/OpenHermes-2.5-Mistral-7B
|
||||||
@ -79,8 +85,14 @@ int main(void) {
|
|||||||
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
||||||
// Llama 3
|
// Llama 3
|
||||||
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||||
// Phi 3
|
//Phi-3-mini
|
||||||
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
|
//Phi-3-small
|
||||||
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
|
//Phi-3-medium
|
||||||
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
|
//Phi-3-vision
|
||||||
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
};
|
};
|
||||||
std::vector<char> formatted_chat(1024);
|
std::vector<char> formatted_chat(1024);
|
||||||
int32_t res;
|
int32_t res;
|
||||||
|
@ -28,6 +28,8 @@ printf "Tokenizing using (cpp) llama.cpp ...\n"
|
|||||||
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
|
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
|
||||||
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
|
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
|
||||||
|
|
||||||
|
set +e
|
||||||
|
|
||||||
diff $input.tok $input.tokcpp > /dev/null 2>&1
|
diff $input.tok $input.tokcpp > /dev/null 2>&1
|
||||||
|
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
|
Loading…
x
Reference in New Issue
Block a user