# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
RUN make
+++ /dev/null
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-# We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-# It is up to the user to install the correct vendor-specific support.
-
-Name: llama.cpp-cublas
-Version: %( date "+%%Y%%m%%d" )
-Release: 1%{?dist}
-Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
-License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires: coreutils make gcc-c++ git cuda-toolkit
-Requires: cuda-toolkit
-URL: https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j LLAMA_CUBLAS=1
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
-%config /etc/sysconfig/llama
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
--- /dev/null
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVIDIA's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
+
+Name: llama.cpp-cuda
+Version: %( date "+%%Y%%m%%d" )
+Release: 1%{?dist}
+Summary: Inference of LLaMA models in pure C/C++ with CUDA acceleration
+License: MIT
+Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git cuda-toolkit
+Requires: cuda-toolkit
+URL: https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CUDA inference for Meta's Llama 2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CUDA=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+[Unit]
+Description=Llama.cpp server with CUDA GPU support.
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=no
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
RUN make
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
- (cmakeBool "LLAMA_CUBLAS" useCuda)
+ (cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
RUN make
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
- windows-latest-cmake-cublas:
+ windows-latest-cmake-cuda:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.2.0', '11.7.1']
- build: ['cublas']
+ build: ['cuda']
steps:
- name: Clone
run: |
mkdir build
cd build
- cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+ cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- - windows-latest-cmake-cublas
+ - windows-latest-cmake-cuda
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use CUDA" OFF)
-#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
+option(LLAMA_CUDA "llama: use CUDA" OFF)
+option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
endif()
if (LLAMA_CUBLAS)
+ message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
+ set(LLAMA_CUDA ON)
+endif()
+
+if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
- message(STATUS "cuBLAS found")
+ message(STATUS "CUDA found")
enable_language(CUDA)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
- add_compile_definitions(GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
- message(WARNING "cuBLAS not found")
+ message(WARNING "CUDA not found")
endif()
endif()
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
- add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
if (LLAMA_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA)
set(CUDA_CXX_FLAGS "")
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
set(CUDA_FLAGS -use_fast_math)
if (LLAMA_FATAL_WARNINGS)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+ LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
# identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
$(info )
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
#
# Build library
#
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
-- #### cuBLAS
+- #### CUDA
- This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+ This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
  For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional steps are needed before compiling.
- Using `make`:
```bash
- make LLAMA_CUBLAS=1
+ make LLAMA_CUDA=1
```
- Using `CMake`:
```bash
mkdir build
cd build
- cmake .. -DLLAMA_CUBLAS=ON
+ cmake .. -DLLAMA_CUDA=ON
cmake --build . --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
-<!---
- | LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
---->
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
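As a quick illustration of the `CUDA_VISIBLE_DEVICES` variable mentioned above, the sketch below restricts llama.cpp to the first GPU and offloads all layers to it; the model path is only a placeholder.

```bash
# Sketch: run on GPU 0 only and offload all layers to it.
# The model path is a placeholder - substitute your own GGUF file.
CUDA_VISIBLE_DEVICES=0 ./main -m ./models/7B/ggml-model-q4_0.gguf -ngl 99 -p "Hello"
```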
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
- CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
set -e
- (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
- (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+ (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
#endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
#endif
#if defined(LLAMA_USE_CURL)
return true;
}
params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
invalid_param = true;
return true;
}
-#ifndef GGML_USE_CUBLAS_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {
params.tensor_split[i] = 0.0f;
}
}
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--no-mmap") {
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+ fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
# Token generation performance troubleshooting
-## Verifying that the model is running on the GPU with cuBLAS
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+## Verifying that the model is running on the GPU with CUDA
+Make sure you compiled llama.cpp with the correct environment variables according to [this guide](../README.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
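For an independent check outside of llama.cpp's own output, `nvidia-smi` reports per-GPU memory use and utilization; run it while the model is generating and both should be non-zero if layers were offloaded.

```shell
# Run in a second terminal while ./main is generating.
nvidia-smi
```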
## Example
```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
static std::string get_gpu_info() {
std::string id;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
int count = ggml_backend_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cublas();
+const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
## Orin compile and run
### compile
```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
```
### run on Orin
#include "ggml-alloc.h"
#include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
}
}
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__);
#endif
### Considerations
-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
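As a sketch of what that configure step can look like (the paths below are only illustrative, not a layout the project prescribes), both package locations can be passed through `CMAKE_PREFIX_PATH` at once:

```bash
# Illustrative only: point CMAKE_PREFIX_PATH at both the installed Llama
# CMake package and the CLBlast package that llama.cpp itself was built with.
cmake .. -DCMAKE_PREFIX_PATH="C:/LlamaCPP;C:/CLBlast"
```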
### Build llama.cpp and install to C:\LlamaCPP directory
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
-- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, Baichuan models were built with a context of 4096.
-- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. A combined usage sketch of the GPU options follows this list.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
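The GPU-offload options above (`-ngl`, `-mg`, `-ts`) can be combined. A minimal sketch, assuming a two-GPU machine and a placeholder model path:

```bash
# Sketch: offload all layers, split large tensors 60/40 between GPU 0 and GPU 1,
# and keep small tensors on GPU 0. The model path is a placeholder.
./main -m ./models/7B/ggml-model-q4_0.gguf -ngl 99 -mg 0 -ts 3,2 -p "Hello"
```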
invalid_param = true;
break;
}
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
}
}
#else
- LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+ LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
- LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+ LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
#endif
} else if (arg == "--lora") {
if (++i >= argc) {
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
// add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
ggml_backend_cuda_reg_devices();
#endif
}
int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
return 1;
#else
return 0;
}
int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+ return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl();
}
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
- GGML_API int ggml_cpu_has_cublas (void);
+ GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
#include "ggml-alloc.h"
#include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h"
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
// host buffers should only be used when data is expected to be copied to/from the GPU
if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type();
#ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
}
static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
}
static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &total, &free);
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
}
}
model.bufs.push_back(buf);
bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
if (n_layer >= n_gpu_layers) {
ggml_backend_cuda_register_host_buffer(
ggml_backend_buffer_get_base(buf),
size_t llama_max_devices(void) {
#if defined(GGML_USE_METAL)
return 1;
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
return GGML_CUDA_MAX_DEVICES;
#elif defined(GGML_USE_SYCL)
return GGML_SYCL_MAX_DEVICES;
}
bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
}
ctx->backends.push_back(ctx->backend_metal);
}
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
// pipeline parallelism requires support for async compute and events
// currently this is only implemented in the CUDA backend
pipeline_parallel = false;
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@)
-set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
+set(LLAMA_CUDA @LLAMA_CUDA@)
set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
find_package(BLAS REQUIRED)
endif()
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
make_opts=""
if [[ "$backend" == "cuda" ]]; then
- make_opts="LLAMA_CUBLAS=1"
+ make_opts="LLAMA_CUDA=1"
fi
git checkout $1
cd llama.cpp
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
mkdir build-cublas
cd build-cublas
-cmake -DLLAMA_CUBLAS=1 ../
+cmake -DLLAMA_CUDA=1 ../
make -j
if [ "$1" -eq "0" ]; then
# batched
cd /workspace/llama.cpp
- LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
+ LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench
cd /workspace/llama.cpp
- LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
+ LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel
cd /workspace/llama.cpp
- LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
+ LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi
#if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp
#
-# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
+# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
# more benches
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
- LLAMA_CUBLAS=1 make -j server $log
+ LLAMA_CUDA=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j server $log