Include server in releases + other build system cleanups (#1610)

author Kerfuffle <redacted>
Sat, 27 May 2023 17:04:14 +0000 (11:04 -0600)
committer GitHub <redacted>
Sat, 27 May 2023 17:04:14 +0000 (11:04 -0600)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml

index 41f2dee2806c09fea65fe729ab208b29065fddcd..c98cbcbbebd0c65aab2d501297b8d12b61a42533 100644 (file)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
    push:
      branches:
        - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
    pull_request:
      types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
  
  env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -157,15 +157,15 @@ jobs:
        matrix:
          include:
            - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
            - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
            - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
            - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
            - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
  
      steps:
        - name: Clone
@@ -292,7 +292,7 @@ jobs:
          run: |
            mkdir build
            cd build
-          cmake .. -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
            cmake --build . --config Release
  
        - name: Get commit hash
diff --git a/Makefile b/Makefile

index 804307b531703a28c987dbe7d78daabf0327bbbb..70bd5e90af9d86db3117d3756b1dd17b1ef4f3f3 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
  # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+       BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
  
  ifndef UNAME_S
  UNAME_S := $(shell uname -s)
@@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
         $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
  
  clean:
-       rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+       rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
  
  #
  # Examples
@@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
  save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
         $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
  
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+       $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
  build-info.h: $(wildcard .git/index) scripts/build-info.sh
         @sh scripts/build-info.sh > $@.tmp
         @if ! cmp -s $@.tmp $@; then \
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 7209a2b5232f03e33517ef02ca8f82398cf4af2e..3904412cb932d8a78df54e989bccbc85667e6781 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,7 @@ struct llama_server_context
      std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
      // compare the evaluated prompt with the new prompt
      int new_prompt_len = 0;
-    for (int i = 0;i < prompt_tokens.size(); i++) {
+    for (size_t i = 0; i < prompt_tokens.size(); i++) {
        if (i < processed_tokens.size() &&
          processed_tokens[i] == prompt_tokens[i])
        {
@@ -71,7 +71,7 @@ struct llama_server_context
        {
          embd_inp.push_back(prompt_tokens[i]);
          if(new_prompt_len == 0) {
-          if(i - 1 < n_past) {
+          if(int32_t(i) - 1 < n_past) {
              processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
            }
            // Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,7 @@ struct llama_server_context
      {
        // out of user input, sample next token
        const float temp = params.temp;
-      const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+      // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
        const float top_p = params.top_p;
        const float tfs_z = params.tfs_z;
        const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
      // Avoid add the no show words to the response
      for (std::vector<llama_token> word_tokens : no_show_words)
      {
-      int match_token = 1;
+      size_t match_token = 1;
        if (tokens_predicted.front() == word_tokens.front())
        {
          bool execute_matching = true;
          if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-          for (int i = 1; i < word_tokens.size(); i++)
+          for (size_t i = 1; i < word_tokens.size(); i++)
            {
              if (i >= tokens_predicted.size()) {
                match_token = i;
@@ -601,7 +601,7 @@ int main(int argc, char **argv)
  
    Server svr;
  
-  svr.Get("/", [](const Request &req, Response &res)
+  svr.Get("/", [](const Request &, Response &res)
            { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
  
    svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
                        {"tokens_predicted", llama.num_tokens_predicted}};
                    return res.set_content(data.dump(), "application/json");
                  }
-                catch (json::exception e)
+                catch (const json::exception &e)
                  {
                    // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                    json data = {
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
                          {"content", result },
                          {"stop", !llama.has_next_token }};
                return res.set_content(data.dump(), "application/json");
-            } catch (json::exception e) {
+            } catch (const json::exception &e) {
                // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                json data = {
                          {"content", "" },
author	Kerfuffle <redacted>
	Sat, 27 May 2023 17:04:14 +0000 (11:04 -0600)
committer	GitHub <redacted>
	Sat, 27 May 2023 17:04:14 +0000 (11:04 -0600)
.github/workflows/build.yml		patch | blob | history
Makefile		patch | blob | history
examples/server/server.cpp		patch | blob | history