#
# keep standard at C11 and C++11
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS = -std=c11 -fPIC
+MK_CXXFLAGS = -std=c++11 -fPIC
+
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
-OPT = -Ofast
+MK_CFLAGS += -Ofast
+MK_HOST_CXXFLAGS += -Ofast
+MK_CUDA_CXXFLAGS += -O3
else
-OPT = -O3
+MK_CFLAGS += -O3
+MK_CXXFLAGS += -O3
endif
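# With LLAMA_FAST, -Ofast now goes only to the host compiles (MK_CFLAGS and
# MK_HOST_CXXFLAGS) while CUDA device code stays at -O3 via MK_CUDA_CXXFLAGS;
# this replaces the $(subst -Ofast,-O3,$(CXXFLAGS)) workaround previously
# applied in the ggml-cuda.o rule.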
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS = $(OPT) -std=c11 -fPIC
-MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
-MK_LDFLAGS =
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available:
MK_CFLAGS += -march=native -mtune=native
- MK_CXXFLAGS += -march=native -mtune=native
+ MK_HOST_CXXFLAGS += -march=native -mtune=native
 # Use AVX only:
#MK_CFLAGS += -mfma -mf16c -mavx
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
- $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
+ $(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
endif # LLAMA_NO_K_QUANTS
# combine build flags with cmdline overrides
-override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
+override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
+override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
+override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
+
+# fold the current CXXFLAGS into NVCCFLAGS before the host-only options are appended to CXXFLAGS below
+NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CXXFLAGS += $(HOST_CXXFLAGS)
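# For illustration only (assuming LLAMA_FAST on x86_64 and no user overrides),
# the variables above expand roughly to
#   CFLAGS        = -I. -Icommon -std=c11 -fPIC -Ofast -march=native -mtune=native
#   HOST_CXXFLAGS = -Ofast -march=native -mtune=native
#   CUDA_CXXFLAGS = -O3
#   NVCCFLAGS     = (pre-existing NVCCFLAGS) -I. -Icommon -std=c++11 -fPIC -O3 -Wno-pedantic
#                   -Xcompiler "-Ofast -march=native -mtune=native"
#   CXXFLAGS      = -I. -Icommon -std=c++11 -fPIC -Ofast -march=native -mtune=native
# i.e. host-only options reach the host compiler through -Xcompiler instead of
# being handed to nvcc directly.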
#
# Print build information
#
$(info I llama.cpp build info: )
-$(info I UNAME_S: $(UNAME_S))
-$(info I UNAME_P: $(UNAME_P))
-$(info I UNAME_M: $(UNAME_M))
-$(info I CFLAGS: $(CFLAGS))
-$(info I CXXFLAGS: $(CXXFLAGS))
-$(info I LDFLAGS: $(LDFLAGS))
-$(info I CC: $(CCV))
-$(info I CXX: $(CXXV))
+$(info I UNAME_S: $(UNAME_S))
+$(info I UNAME_P: $(UNAME_P))
+$(info I UNAME_M: $(UNAME_M))
+$(info I CFLAGS: $(CFLAGS))
+$(info I CXXFLAGS: $(CXXFLAGS))
+$(info I NVCCFLAGS: $(NVCCFLAGS))
+$(info I LDFLAGS: $(LDFLAGS))
+$(info I CC: $(CCV))
+$(info I CXX: $(CXXV))
$(info )
#
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
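// the macro now forwards the flags argument, matching cudaStreamWaitEvent's
// three-parameter signature; call sites pass 0 (the default wait behaviour)
// explicitly instead of having the macro hard-code it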
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else
} while (0)
#endif // CUDART_VERSION >= 11
+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
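A minimal usage sketch (illustrative only; the kernel, file name and launch
parameters below are made up, not taken from ggml-cuda.cu): GGML_CUDA_ASSUME
hands nvcc a value-range hint it may exploit for optimization and expands to
nothing below the 11100 gate above. The asserted condition must still hold at
runtime, since a false assumption is undefined behaviour.

    // assume_demo.cu -- build with: nvcc -o assume_demo assume_demo.cu
    #include <cstdio>

    #define WARP_SIZE 32

    #if defined(CUDART_VERSION) && CUDART_VERSION >= 11100
    #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
    #else
    #define GGML_CUDA_ASSUME(x)
    #endif

    // one warp per block, each block scales one row of a dense matrix
    __global__ void scale_rows(float * dst, const float * src, const int ncols) {
        const int lane = threadIdx.x % WARP_SIZE;
        GGML_CUDA_ASSUME(lane >= 0);         // hints for the optimizer: lane is a
        GGML_CUDA_ASSUME(lane <  WARP_SIZE); // valid lane index within the warp
        for (int col = lane; col < ncols; col += WARP_SIZE) {
            dst[blockIdx.x*ncols + col] = 2.0f*src[blockIdx.x*ncols + col];
        }
    }

    int main() {
        const int nrows = 4, ncols = 128;
        float * src; float * dst;
        cudaMallocManaged(&src, nrows*ncols*sizeof(float));
        cudaMallocManaged(&dst, nrows*ncols*sizeof(float));
        for (int i = 0; i < nrows*ncols; ++i) { src[i] = 1.0f; }
        scale_rows<<<nrows, WARP_SIZE>>>(dst, src, ncols);
        cudaDeviceSynchronize();
        printf("dst[0] = %.1f\n", dst[0]); // expect 2.0
        cudaFree(src); cudaFree(dst);
        return 0;
    }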
#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
typedef half2 dfloat2;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI4_0;
const int kqsx = k % QI4_0;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI4_1;
const int kqsx = k % QI4_1;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI5_0;
const int kqsx = k % QI5_0;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI5_1;
const int kqsx = k % QI5_1;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI8_0;
const int kqsx = k % QI8_0;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI2_K;
const int kqsx = k % QI2_K;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI3_K;
const int kqsx = k % QI3_K;
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI4_K; // == 0 if QK_K == 256
const int kqsx = k % QI4_K; // == k if QK_K == 256
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI5_K; // == 0 if QK_K == 256
const int kqsx = k % QI5_K; // == k if QK_K == 256
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
const int kbx = k / QI6_K; // == 0 if QK_K == 256
const int kqsx = k % QI6_K; // == k if QK_K == 256
// wait for main GPU data if necessary
if (split && (id != g_main_device || is != 0)) {
- CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
}
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
for (int64_t id = 0; id < g_device_count; ++id) {
for (int64_t is = 0; is < is_max; ++is) {
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
}
}
}