endif
endif
+ifdef LLAMA_RPC # when RPC support is requested, rpc-server joins BUILD_TARGETS (the prerequisites of `default`)
+ BUILD_TARGETS += rpc-server
+endif # LLAMA_RPC
+
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
+ifdef LLAMA_RPC # RPC backend: define GGML_USE_RPC for the preprocessor and add ggml-rpc.o to OBJS
+ MK_CPPFLAGS += -DGGML_USE_RPC
+ OBJS += ggml-rpc.o
+endif # LLAMA_RPC
+
ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1
endif
endif # LLAMA_METAL
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
+ifdef LLAMA_RPC # rules below exist only when RPC support is requested
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h # RPC backend object
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h # object for the rpc-server example
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) # prerequisites expand when this rule is read, so COMMON_DEPS and OBJS must already be set above
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
GF_CC := $(CC)
include scripts/get-flags.mk
unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
// this tensor was allocated without ggml-backend
return;
}
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+ ggml_backend_view_init(tensor);
}
} else {
if (tensor->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) {
- ggml_backend_view_init(buffer, t);
+ ggml_backend_view_init(t);
}
} else {
if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
- ggml_backend_view_init(buffer, t);
+ ggml_backend_view_init(t);
}
}
}
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (dst_buf->iface.cpy_tensor) {
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst); // call through dst_buf, the same interface pointer that was NULL-checked just above
}
return false;
}
// utils
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+void ggml_backend_view_init(struct ggml_tensor * tensor) { // set up a view tensor from its view_src; the buffer is no longer a parameter
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
- tensor->buffer = buffer;
+ tensor->buffer = tensor->view_src->buffer; // the view takes the buffer of the tensor it views
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
- ggml_backend_buffer_init_tensor(buffer, tensor);
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor); // same call as before, now given the buffer assigned above
}
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
- ggml_backend_view_init(dst->view_src->buffer, dst);
+ ggml_backend_view_init(dst);
}
else {
ggml_backend_tensor_copy(src, dst);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
#ifdef __cplusplus
if (remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface,
- new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
+ new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
remote_size);
return buffer;
} else {
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
/* .endpoint = */ endpoint,
- /* .name = */ "RPC",
+ /* .name = */ "RPC[" + std::string(endpoint) + "]",
};
ggml_backend_t backend = new ggml_backend {
struct llama_control_vector cvec;
};
+static size_t llama_get_device_count(const llama_model & model) { // device count: local backend devices plus one per configured RPC server
+ size_t count = 1; // default when none of CUDA / SYCL / Vulkan is compiled in
+#if defined(GGML_USE_CUDA)
+ count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+ count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+ count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC) // separate #if: RPC servers are added to the local count, not chosen instead of it
+ count += model.rpc_servers.size();
+#endif
+ return count;
+ GGML_UNUSED(model); // never executed; presumably keeps `model` referenced when GGML_USE_RPC is not defined
+}
+
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_RPC
- std::string endpoint = model.rpc_servers[gpu];
- buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+ int dev_count = (int)llama_get_device_count(model);
+ int rpc_count = (int)model.rpc_servers.size();
+ if (gpu >= dev_count - rpc_count) {
+ const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+ return ggml_backend_rpc_buffer_type(endpoint);
+ }
+#endif
+#if defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
GGML_UNUSED(tensor_split);
}
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
- return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
- return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
- return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
- return ggml_backend_vk_get_device_count();
-#else
- return 1;
-#endif
- GGML_UNUSED(model);
-}
-
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
- size_t total;
- size_t free;
- std::string endpoint = model.rpc_servers[device];
- ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
- return free;
-#elif defined(GGML_USE_CUDA)
+ int dev_count = (int)llama_get_device_count(model);
+ int rpc_count = (int)model.rpc_servers.size();
+ if (device >= dev_count - rpc_count) {
+ size_t total;
+ size_t free;
+ const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+ return free;
+ }
+#endif
+#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return true;
};
}
- if (params.rpc_servers != nullptr) {
+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
// split the servers set them into model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
if (!hparams.vocab_only) {
// initialize backends
-#if defined(GGML_USE_RPC)
- for (auto & server : model->rpc_servers) {
- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
}
ctx->backends.push_back(backend);
}
+#endif
+#if defined(GGML_USE_RPC)
+ if (model->n_gpu_layers > 0) {
+ for (const auto & endpoint : model->rpc_servers) {
+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ }
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {