2023-06-06 21:33:23 +02:00
|
|
|
#pragma once
|
|
|
|
|
2023-04-29 01:31:56 +02:00
|
|
|
#include "ggml.h"
|
2023-04-21 21:59:17 +02:00
|
|
|
|
2023-04-20 03:14:14 +02:00
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
2023-06-06 21:33:23 +02:00
|
|
|
#define GGML_CUDA_MAX_DEVICES 16
|
|
|
|
|
|
|
|
struct ggml_tensor_extra_gpu {
|
|
|
|
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
|
|
|
};
|
|
|
|
|
2023-05-01 18:11:07 +02:00
|
|
|
void ggml_init_cublas(void);
|
2023-06-06 21:33:23 +02:00
|
|
|
void ggml_cuda_set_tensor_split(const float * tensor_split);
|
2023-04-21 21:59:17 +02:00
|
|
|
|
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
|
|
|
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
2023-05-01 18:11:07 +02:00
|
|
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
|
|
|
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
|
|
|
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
2023-04-21 21:59:17 +02:00
|
|
|
|
2023-05-01 18:11:07 +02:00
|
|
|
// TODO: export these with GGML_API
|
2023-04-29 02:04:18 +02:00
|
|
|
void * ggml_cuda_host_malloc(size_t size);
|
|
|
|
void ggml_cuda_host_free(void * ptr);
|
|
|
|
|
2023-06-12 14:44:16 +02:00
|
|
|
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
|
|
|
|
2023-06-06 21:33:23 +02:00
|
|
|
void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
|
|
|
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
|
|
|
void ggml_cuda_set_main_device(int main_device);
|
|
|
|
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
|
|
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
2023-05-13 15:38:36 +02:00
|
|
|
|
2023-04-20 03:14:14 +02:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|