# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
RUN make
+++ /dev/null
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-# We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-# It is up to the user to install the correct vendor-specific support.
-
-Name: llama.cpp-cublas
-Version: %( date "+%%Y%%m%%d" )
-Release: 1%{?dist}
-Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
-License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires: coreutils make gcc-c++ git cuda-toolkit
-Requires: cuda-toolkit
-URL: https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j LLAMA_CUBLAS=1
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
-%config /etc/sysconfig/llama
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
--- /dev/null
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVIDIA's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
+
+Name: llama.cpp-cuda
+Version: %( date "+%%Y%%m%%d" )
+Release: 1%{?dist}
+Summary: Inference of LLaMA models in pure C/C++ with CUDA acceleration
+License: MIT
+Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git cuda-toolkit
+Requires: cuda-toolkit
+URL: https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CUDA inference for Meta's Llama 2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CUDA=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+[Unit]
+Description=Llama.cpp server with CUDA GPU support.
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=no
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
RUN make
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
- (cmakeBool "LLAMA_CUBLAS" useCuda)
+ (cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
RUN make
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
- windows-latest-cmake-cublas:
+ windows-latest-cmake-cuda:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.2.0', '11.7.1']
- build: ['cublas']
+ build: ['cuda']
steps:
- name: Clone
run: |
mkdir build
cd build
- cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+ cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- - windows-latest-cmake-cublas
+ - windows-latest-cmake-cuda
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use CUDA" OFF)
-#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
+option(LLAMA_CUDA "llama: use CUDA" OFF)
+option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
endif()
if (LLAMA_CUBLAS)
+ message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
+ set(LLAMA_CUDA ON)
+endif()
+
+if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
- message(STATUS "cuBLAS found")
+ message(STATUS "CUDA found")
enable_language(CUDA)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
- add_compile_definitions(GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
- message(WARNING "cuBLAS not found")
+ message(WARNING "CUDA not found")
endif()
endif()
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
- add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
if (LLAMA_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA)
set(CUDA_CXX_FLAGS "")
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
set(CUDA_FLAGS -use_fast_math)
if (LLAMA_FATAL_WARNINGS)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+ LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
# identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
$(info )
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
#
# Build library
#
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
-- #### cuBLAS
+- #### CUDA
- This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+ This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
  For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional steps are needed before compiling.
- Using `make`:
```bash
- make LLAMA_CUBLAS=1
+ make LLAMA_CUDA=1
```
- Using `CMake`:
```bash
mkdir build
cd build
- cmake .. -DLLAMA_CUBLAS=ON
+ cmake .. -DLLAMA_CUDA=ON
cmake --build . --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
-<!---
- | LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
---->
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
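As a quick illustration of the `CUDA_VISIBLE_DEVICES` variable mentioned above, the sketch below restricts llama.cpp to the first GPU and offloads all layers to it; the model path is only a placeholder.

```bash
# Sketch: run on GPU 0 only and offload all layers to it.
# The model path is a placeholder - substitute your own GGUF file.
CUDA_VISIBLE_DEVICES=0 ./main -m ./models/7B/ggml-model-q4_0.gguf -ngl 99 -p "Hello"
```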
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
- CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
set -e
- (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
- (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+ (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
#endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
#endif
#if defined(LLAMA_USE_CURL)
return true;
}
params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
invalid_param = true;
return true;
}
-#ifndef GGML_USE_CUBLAS_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {
params.tensor_split[i] = 0.0f;
}
}
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--no-mmap") {
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+ fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
# Token generation performance troubleshooting
-## Verifying that the model is running on the GPU with cuBLAS
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+## Verifying that the model is running on the GPU with CUDA
+Make sure you compiled llama.cpp with the correct environment variables according to [this guide](../README.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
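For an independent check outside of llama.cpp's own output, `nvidia-smi` reports per-GPU memory use and utilization; run it while the model is generating and both should be non-zero if layers were offloaded.

```shell
# Run in a second terminal while ./main is generating.
nvidia-smi
```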
## Example
```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
static std::string get_gpu_info() {
std::string id;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
int count = ggml_backend_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cublas();
+const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
## Orin compile and run
### compile
```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
```
### run on Orin
#include "ggml-alloc.h"
#include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
}
}
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__);
#endif
### Considerations
-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
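As a sketch of what that configure step can look like (the paths below are only illustrative, not a layout the project prescribes), both package locations can be passed through `CMAKE_PREFIX_PATH` at once:

```bash
# Illustrative only: point CMAKE_PREFIX_PATH at both the installed Llama
# CMake package and the CLBlast package that llama.cpp itself was built with.
cmake .. -DCMAKE_PREFIX_PATH="C:/LlamaCPP;C:/CLBlast"
```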
### Build llama.cpp and install to C:\LlamaCPP directory
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
-- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, Baichuan models were built with a context of 4096.
-- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. A combined usage sketch of the GPU options follows this list.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
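The GPU-offload options above (`-ngl`, `-mg`, `-ts`) can be combined. A minimal sketch, assuming a two-GPU machine and a placeholder model path:

```bash
# Sketch: offload all layers, split large tensors 60/40 between GPU 0 and GPU 1,
# and keep small tensors on GPU 0. The model path is a placeholder.
./main -m ./models/7B/ggml-model-q4_0.gguf -ngl 99 -mg 0 -ts 3,2 -p "Hello"
```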
invalid_param = true;
break;
}
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
}
}
#else
- LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+ LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
- LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+ LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
#endif
} else if (arg == "--lora") {
if (++i >= argc) {
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
// add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
ggml_backend_cuda_reg_devices();
#endif
}
int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
return 1;
#else
return 0;
}
int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+ return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl();
}
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
- GGML_API int ggml_cpu_has_cublas (void);
+ GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
#include "ggml-alloc.h"
#include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h"
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
// host buffers should only be used when data is expected to be copied to/from the GPU
if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type();
#ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
}
static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
}
static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &total, &free);
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
}
}
model.bufs.push_back(buf);
bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
if (n_layer >= n_gpu_layers) {
ggml_backend_cuda_register_host_buffer(
ggml_backend_buffer_get_base(buf),
size_t llama_max_devices(void) {
#if defined(GGML_USE_METAL)
return 1;
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
return GGML_CUDA_MAX_DEVICES;
#elif defined(GGML_USE_SYCL)
return GGML_SYCL_MAX_DEVICES;
}
bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
}
ctx->backends.push_back(ctx->backend_metal);
}
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
// pipeline parallelism requires support for async compute and events
// currently this is only implemented in the CUDA backend
pipeline_parallel = false;
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@)
-set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
+set(LLAMA_CUDA @LLAMA_CUDA@)
set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
find_package(BLAS REQUIRED)
endif()
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
make_opts=""
if [[ "$backend" == "cuda" ]]; then
- make_opts="LLAMA_CUBLAS=1"
+ make_opts="LLAMA_CUDA=1"
fi
git checkout $1
cd llama.cpp
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
mkdir build-cublas
cd build-cublas
-cmake -DLLAMA_CUBLAS=1 ../
+cmake -DLLAMA_CUDA=1 ../
make -j
if [ "$1" -eq "0" ]; then
# batched
cd /workspace/llama.cpp
- LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
+ LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench
cd /workspace/llama.cpp
- LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
+ LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel
cd /workspace/llama.cpp
- LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
+ LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi
#if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp
#
-# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
+# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
# more benches
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
- LLAMA_CUBLAS=1 make -j server $log
+ LLAMA_CUDA=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j server $log