(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
- (cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
cd build
ctest -L main --verbose --timeout 900
- ubuntu-latest-cmake-mpi:
- runs-on: ubuntu-latest
-
- continue-on-error: true
-
- strategy:
- matrix:
- mpi_library: [mpich, libopenmpi-dev]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
-
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get install build-essential ${{ matrix.mpi_library }}
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake -DLLAMA_MPI=ON ..
- cmake --build . --config Release -j $(nproc)
-
- - name: Test
- id: cmake_test
- run: |
- cd build
- ctest -L main --verbose
-
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest
"llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
-option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
endif()
endif()
-if (LLAMA_MPI)
- cmake_minimum_required(VERSION 3.10)
- find_package(MPI)
- if (MPI_C_FOUND)
- message(STATUS "MPI found")
-
- set(GGML_HEADERS_MPI ggml-mpi.h)
- set(GGML_SOURCES_MPI ggml-mpi.c)
-
- add_compile_definitions(GGML_USE_MPI)
- add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-
- if (NOT MSVC)
- add_compile_options(-Wno-cast-qual)
- endif()
-
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
- set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-
- # Even if you're only using the C header, C++ programs may bring in MPI
- # C++ functions, so more linkage is needed
- if (MPI_CXX_FOUND)
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
- endif()
- else()
- message(WARNING "MPI not found")
- endif()
-endif()
-
if (LLAMA_RPC)
add_compile_definitions(GGML_USE_RPC)
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
- ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
- "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+ "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
endif
endif # LLAMA_NO_ACCELERATE
-ifdef LLAMA_MPI
- MK_CPPFLAGS += -DGGML_USE_MPI
- MK_CFLAGS += -Wno-cast-qual
- MK_CXXFLAGS += -Wno-cast-qual
- OBJS += ggml-mpi.o
-endif # LLAMA_MPI
-
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
endif
endif # LLAMA_METAL
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
- $(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
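
For example, a minimal sketch (the model path is just the illustrative one used elsewhere in this README) of forcing a Metal-enabled build to run entirely on the CPU:

```bash
# Metal build, but keep all layers on the CPU by offloading 0 GPU layers
./main -m ./models/7B/ggml-model-q4_0.gguf -p "Hello" -n 64 -ngl 0
```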
-### MPI Build
-
-MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
-
-First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
-
-Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
-
-- Using `make`:
-
- ```bash
- make CC=mpicc CXX=mpicxx LLAMA_MPI=1
- ```
-
-- Using `CMake`:
-
- ```bash
- cmake -S . -B build -DLLAMA_MPI=ON
- ```
-
-Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
-
-Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
-
-Here is an example hostfile:
-
-```
-192.168.0.1:2
-malvolio.local:1
-```
-
-The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
-
-Finally, you're ready to run a computation using `mpirun`:
-
-```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
-```
-
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing when using batch sizes higher than 32 (the default is 512). CPU-only BLAS implementations don't affect the normal generation performance, while GPU-backed BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast, may improve it. There are currently several different BLAS implementations available for build and use:
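
As a rough sketch before the per-implementation instructions, a CPU-only OpenBLAS build might be configured like this (assuming OpenBLAS is installed and that `LLAMA_BLAS_VENDOR` matches the implementation on your system):

```bash
# Configure and build with OpenBLAS as the BLAS backend
cmake -S . -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release -j $(nproc)
```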
+++ /dev/null
-#include "ggml-mpi.h"
-
-#include "ggml.h"
-
-#include <mpi.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_mpi_context {
- int rank;
- int size;
-};
-
-void ggml_mpi_backend_init(void) {
- MPI_Init(NULL, NULL);
-}
-
-void ggml_mpi_backend_free(void) {
- MPI_Finalize();
-}
-
-struct ggml_mpi_context * ggml_mpi_init(void) {
- struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
-
- MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
- MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
-
- return ctx;
-}
-
-void ggml_mpi_free(struct ggml_mpi_context * ctx) {
- free(ctx);
-}
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
- return ctx->rank;
-}
-
-void ggml_mpi_eval_init(
- struct ggml_mpi_context * ctx_mpi,
- int * n_tokens,
- int * n_past,
- int * n_threads) {
- UNUSED(ctx_mpi);
-
- // synchronize the worker node parameters with the root node
- MPI_Barrier(MPI_COMM_WORLD);
-
- MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD);
- MPI_Bcast(n_past, 1, MPI_INT, 0, MPI_COMM_WORLD);
- MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
-}
-
-static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
- struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
- if (t == NULL) {
- fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
- return -1;
- }
-
- for (int i = 0; i < gf->n_nodes; i++) {
- if (gf->nodes[i] == t) {
- return i;
- }
- }
-
- fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
- return -1;
-}
-
-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
- MPI_Datatype mpi_type;
-
- switch (t->type) {
- case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
- case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
- default: GGML_ASSERT(false && "not implemented");
- }
-
- const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
- GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
- MPI_Datatype mpi_type;
-
- switch (t->type) {
- case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
- case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
- default: GGML_ASSERT(false && "not implemented");
- }
-
- MPI_Status status; UNUSED(status);
-
- const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
- GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
- struct ggml_mpi_context * ctx_mpi,
- struct ggml_cgraph * gf,
- int n_layers) {
- const int mpi_rank = ctx_mpi->rank;
- const int mpi_size = ctx_mpi->size;
-
- struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
- if (inp_tokens == NULL) {
- fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
- return;
- }
-
- struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
- if (inp0 == NULL) {
- fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
- return;
- }
-
- GGML_ASSERT(inp0 == gf->nodes[0]);
-
- // distribute the compute graph into slices across the MPI nodes
- //
- // the main node (0) processes the last layers + the remainder of the compute graph
- // and is responsible to pass the input tokens to the first node (1)
- //
- // node 1: [( 0) * n_per_node, ( 1) * n_per_node)
- // node 2: [( 1) * n_per_node, ( 2) * n_per_node)
- // ...
- // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
- // node 0: [(n-1) * n_per_node, n_nodes)
- //
- if (mpi_rank > 0) {
- if (mpi_rank == 1) {
- // the first node (1) receives the input tokens from the main node (0)
- ggml_mpi_tensor_recv(inp_tokens, 0);
- } else {
- // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
- ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
- }
- } else if (mpi_size > 1) {
- // node 0 sends the input tokens to node 1
- ggml_mpi_tensor_send(inp_tokens, 1);
-
- // recv the output data from the last node
- ggml_mpi_tensor_recv(inp0, mpi_size - 1);
- }
-
- {
- const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
-
- const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
-
- const int il0 = (mpi_idx + 0) * n_per_node;
- const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
-
- char name_l0[GGML_MAX_NAME];
- char name_l1[GGML_MAX_NAME];
-
- snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
- snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
-
- const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0);
- const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
-
- if (idx_l0 < 0 || idx_l1 < 0) {
- fprintf(stderr, "%s: layer input nodes not found\n", __func__);
- return;
- }
-
- // attach the input data to all nodes that need it
- // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
- for (int i = idx_l0; i < idx_l1; i++) {
- if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
- gf->nodes[i]->src[0] = inp0;
- }
- if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
- gf->nodes[i]->src[1] = inp0;
- }
- }
-
- // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
- for (int i = 1; i < idx_l1 - idx_l0; i++) {
- gf->nodes[i] = gf->nodes[idx_l0 + i];
- gf->grads[i] = gf->grads[idx_l0 + i];
- }
-
- // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
- if (mpi_idx != 0) {
- gf->nodes[0]->op = GGML_OP_NONE;
- }
-
- gf->n_nodes = idx_l1 - idx_l0;
-
- //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
- }
-}
-
-void ggml_mpi_graph_compute_post(
- struct ggml_mpi_context * ctx_mpi,
- struct ggml_cgraph * gf,
- int n_layers) {
- UNUSED(n_layers);
-
- const int mpi_rank = ctx_mpi->rank;
- const int mpi_size = ctx_mpi->size;
-
- // send the output data to the next node
- if (mpi_rank > 0) {
- ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
- }
-}
+++ /dev/null
-#pragma once
-
-struct ggml_context;
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mpi_context;
-
-void ggml_mpi_backend_init(void);
-void ggml_mpi_backend_free(void);
-
-struct ggml_mpi_context * ggml_mpi_init(void);
-void ggml_mpi_free(struct ggml_mpi_context * ctx);
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx);
-
-void ggml_mpi_eval_init(
- struct ggml_mpi_context * ctx_mpi,
- int * n_tokens,
- int * n_past,
- int * n_threads);
-
-void ggml_mpi_graph_compute_pre(
- struct ggml_mpi_context * ctx_mpi,
- struct ggml_cgraph * gf,
- int n_layers);
-
-void ggml_mpi_graph_compute_post(
- struct ggml_mpi_context * ctx_mpi,
- struct ggml_cgraph * gf,
- int n_layers);
-
-#ifdef __cplusplus
-}
-#endif
#ifdef GGML_USE_METAL
# include "ggml-metal.h"
#endif
-#ifdef GGML_USE_MPI
-# include "ggml-mpi.h"
-#endif
#ifndef QK_K
# ifdef GGML_QKK_64
# define QK_K 64
// control vectors
struct llama_control_vector cvec;
-
-#ifdef GGML_USE_MPI
- ggml_mpi_context * ctx_mpi = NULL;
-#endif
};
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
} else {
-#ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
-#endif
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
inpL = lctx.inp_embd;
ggml_set_input(lctx.inp_embd);
}
llama_context & lctx,
ggml_cgraph * gf,
int n_threads) {
-#ifdef GGML_USE_MPI
- const int64_t n_layer = lctx.model.hparams.n_layer;
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(lctx.backend_metal)) {
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
}
// decode a batch of tokens by evaluating the transformer
}
lctx.n_queued_tokens += n_tokens_all;
-#ifdef GGML_USE_MPI
- // TODO: needs fix after #3228
- GGML_ASSERT(false && "not implemented");
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
auto & kv_self = lctx.kv_self;
const int64_t n_embd = hparams.n_embd;
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
-
-#ifdef GGML_USE_MPI
- ggml_mpi_backend_init();
-#endif
}
void llama_numa_init(enum ggml_numa_strategy numa) {
}
void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
- ggml_mpi_backend_free();
-#endif
ggml_quantize_free();
}
}
}
-#ifdef GGML_USE_MPI
- ctx->ctx_mpi = ggml_mpi_init();
-
- if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
- // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
- // TODO: needs fix after #3228
- GGML_ASSERT(false && "not implemented");
- //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
- //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
- llama_backend_free();
- exit(1);
- }
-#endif
-
return ctx;
}
set(LLAMA_BLAS @LLAMA_BLAS@)
set(LLAMA_CUDA @LLAMA_CUDA@)
set(LLAMA_METAL @LLAMA_METAL@)
-set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
-if (LLAMA_MPI)
- find_package(MPI REQUIRED)
-endif()
-
if (LLAMA_CLBLAST)
find_package(CLBlast REQUIRED)
endif()